This file is designed to use CDC data to assess coronavirus disease burden by state, including creating and analyzing state-level cluters.
Through March 7, 2021, The COVID Tracking Project collected and integrated data on tests, cases, hospitalizations, deaths, and the like by state and date. The latest code for using this data is available in Coronavirus_Statistics_CTP_v004.Rmd.
The COVID Tracking Project suggest that US federal data sources are now sufficiently robust to be used for analyses that previously relied on COVID Tracking Project. This code is an attempt to update modules in Coronavirus_Statistics_CTP_v004.Rmd to leverage US federal data.
The code in this module builds on code available in _v001, and splits many functions in to two main .R files that can be sourced:
Broadly, the CDC data analyzed by this module includes:
The tidyverse package is loaded and functions are sourced:
# The tidyverse functions are routinely used without package::function format
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.1 v dplyr 1.0.6
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Functions are available in source file
source("./Generic_Added_Utility_Functions_202105_v001.R")
source("./Coronavirus_CDC_Daily_Functions_v001.R")
A series of mapping files are also available to allow for parameterized processing. Mappings include:
These default parameters are maintained in a separate .R file and can be sourced:
source("./Coronavirus_CDC_Daily_Default_Mappings_v002.R")
Additionally, a mapping file could be maintained to give default plotting labels to variables. This is currently not used by any of the sourced functions:
# Create a variable mapping file - this is currently redundant
varMapper <- c()
Code from the previous model is run, with results compared to previous results:
readList <- list("cdcDaily"="./RInputFiles/Coronavirus/CDC_dc_downloaded_210502.csv",
"cdcHosp"="./RInputFiles/Coronavirus/CDC_h_downloaded_210509.csv"
)
cdc_daily_compare <- readRunCDCDaily(thruLabel="May 2, 2021",
readFrom=readList,
compareFile=list("cdcDaily"=colRenamer(readFromRDS("dfRaw_dc_210414"),
c('new_case'='new_cases',
'tot_death'='tot_deaths',
'new_death'='new_deaths'
)
),
"cdcHosp"=readFromRDS("dfHosp_old")
),
writeLog="./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_v002.log",
ovrwriteLog=TRUE,
dfPerCapita=NULL,
useClusters=readFromRDS("cdc_daily_test_v2")$useClusters,
skipAssessmentPlots=FALSE,
brewPalette="Paired"
)
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/CDC_dc_downloaded_210502.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## submission_date = col_character(),
## state = col_character(),
## tot_cases = col_double(),
## conf_cases = col_double(),
## prob_cases = col_double(),
## new_case = col_double(),
## pnew_case = col_double(),
## tot_death = col_double(),
## conf_death = col_double(),
## prob_death = col_double(),
## new_death = col_double(),
## pnew_death = col_double(),
## created_at = col_character(),
## consent_cases = col_character(),
## consent_deaths = col_character()
## )
##
## *** File has been checked for uniqueness by: state date
##
##
## Checking for similarity of: column names
## In reference but not in current: naconf
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 18
## Detailed differences available in: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_v002.log
##
## Checking for similarity of: state
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 97 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_v002.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 14 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_v002.log
##
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/CDC_h_downloaded_210509.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## state = col_character(),
## date = col_date(format = ""),
## geocoded_state = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: state date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference: previous_day_admission_adult_covid_confirmed_18-19 previous_day_admission_adult_covid_confirmed_18-19_coverage previous_day_admission_adult_covid_confirmed_20-29 previous_day_admission_adult_covid_confirmed_20-29_coverage previous_day_admission_adult_covid_confirmed_30-39 previous_day_admission_adult_covid_confirmed_30-39_coverage previous_day_admission_adult_covid_confirmed_40-49 previous_day_admission_adult_covid_confirmed_40-49_coverage previous_day_admission_adult_covid_confirmed_50-59 previous_day_admission_adult_covid_confirmed_50-59_coverage previous_day_admission_adult_covid_confirmed_60-69 previous_day_admission_adult_covid_confirmed_60-69_coverage previous_day_admission_adult_covid_confirmed_70-79 previous_day_admission_adult_covid_confirmed_70-79_coverage previous_day_admission_adult_covid_confirmed_80+ previous_day_admission_adult_covid_confirmed_80+_coverage previous_day_admission_adult_covid_confirmed_unknown previous_day_admission_adult_covid_confirmed_unknown_coverage previous_day_admission_adult_covid_suspected_18-19 previous_day_admission_adult_covid_suspected_18-19_coverage previous_day_admission_adult_covid_suspected_20-29 previous_day_admission_adult_covid_suspected_20-29_coverage previous_day_admission_adult_covid_suspected_30-39 previous_day_admission_adult_covid_suspected_30-39_coverage previous_day_admission_adult_covid_suspected_40-49 previous_day_admission_adult_covid_suspected_40-49_coverage previous_day_admission_adult_covid_suspected_50-59 previous_day_admission_adult_covid_suspected_50-59_coverage previous_day_admission_adult_covid_suspected_60-69 previous_day_admission_adult_covid_suspected_60-69_coverage previous_day_admission_adult_covid_suspected_70-79 previous_day_admission_adult_covid_suspected_70-79_coverage previous_day_admission_adult_covid_suspected_80+ previous_day_admission_adult_covid_suspected_80+_coverage previous_day_admission_adult_covid_suspected_unknown previous_day_admission_adult_covid_suspected_unknown_coverage
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 15
## Detailed differences available in: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_v002.log
##
## Checking for similarity of: state
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 6 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_v002.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 63 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_v002.log
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 6
## isType tot_cases tot_deaths new_cases new_deaths n
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 before 5.08e+9 1.07e+8 3.21e+7 558830 27435
## 2 after 5.06e+9 1.06e+8 3.19e+7 556355 23715
## 3 pctchg 4.40e-3 3.81e-3 4.47e-3 0.00443 0.136
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 5
## isType inp hosp_adult hosp_ped n
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 before 2.57e+7 1.99e+7 436353 23230
## 2 after 2.56e+7 1.98e+7 426239 22395
## 3 pctchg 5.60e-3 5.66e-3 0.0232 0.0359
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj =
## prefer_proj): Discarded datum unknown in CRS definition
identical(cdc_daily_compare[c("stateData", "dfRaw", "dfProcess", "dfPerCapita", "useClusters")],
readFromRDS("cdc_daily_test_v3")[c("stateData", "dfRaw", "dfProcess", "dfPerCapita", "useClusters")]
)
## [1] TRUE
identical(cdc_daily_compare$plotDataList[c("dfFull", "dfAgg", "plotClusters")],
readFromRDS("cdc_daily_test_v3")$plotDataList[c("dfFull", "dfAgg", "plotClusters")]
)
## [1] TRUE
The core data elements are identical, and the plots appear to convey the same information. Next steps are to download the latest data and process with existing clusters.
Updated data are downloaded and processed, using existing segments. The downloadTo argument is edited using lapply to avoid downloading data if it has previously been downloaded:
readList <- list("cdcDaily"="./RInputFiles/Coronavirus/CDC_dc_downloaded_210528.csv",
"cdcHosp"="./RInputFiles/Coronavirus/CDC_h_downloaded_210528.csv"
)
compareList <- list("cdcDaily"=readFromRDS("cdc_daily_test_v3")$dfRaw$cdcDaily,
"cdcHosp"=readFromRDS("cdc_daily_test_v3")$dfRaw$cdcHosp
)
cdc_daily_210528 <- readRunCDCDaily(thruLabel="May 28, 2021",
downloadTo=lapply(readList, FUN=function(x) if(file.exists(x)) NA else x),
readFrom=readList,
compareFile=compareList,
writeLog="./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210528.log",
useClusters=readFromRDS("cdc_daily_test_v2")$useClusters,
skipAssessmentPlots=FALSE,
brewPalette="Paired"
)
##
## -- Column specification --------------------------------------------------------
## cols(
## submission_date = col_character(),
## state = col_character(),
## tot_cases = col_double(),
## conf_cases = col_double(),
## prob_cases = col_double(),
## new_case = col_double(),
## pnew_case = col_double(),
## tot_death = col_double(),
## conf_death = col_double(),
## prob_death = col_double(),
## new_death = col_double(),
## pnew_death = col_double(),
## created_at = col_character(),
## consent_cases = col_character(),
## consent_deaths = col_character()
## )
##
## *** File has been checked for uniqueness by: state date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 26
## Detailed differences available in: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210528.log
##
## Checking for similarity of: state
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 593 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210528.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 39 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210528.log
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## state = col_character(),
## date = col_date(format = ""),
## geocoded_state = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: state date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 14
## Detailed differences available in: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210528.log
##
## Checking for similarity of: state
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 3 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210528.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 49 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210528.log
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 6
## isType tot_cases tot_deaths new_cases new_deaths n
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 before 5.99e+9 1.24e+8 3.29e+7 577667 28969
## 2 after 5.96e+9 1.23e+8 3.28e+7 575010 25041
## 3 pctchg 4.37e-3 3.82e-3 4.55e-3 0.00460 0.136
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 5
## isType inp hosp_adult hosp_ped n
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 before 2.61e+7 2.03e+7 415621 23972
## 2 after 2.60e+7 2.02e+7 405188 23109
## 3 pctchg 5.67e-3 5.73e-3 0.0251 0.0360
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj =
## prefer_proj): Discarded datum unknown in CRS definition
saveToRDS(cdc_daily_210528, ovrWrite=FALSE, ovrWriteError=FALSE)
The process appears to work as intended. Next steps are to update the county-level data process, making use of some of the functions available for CDC data processing.
The latest version of the data are downloaded and processed:
readList <- list("cdcDaily"="./RInputFiles/Coronavirus/CDC_dc_downloaded_210708.csv",
"cdcHosp"="./RInputFiles/Coronavirus/CDC_h_downloaded_210708.csv"
)
compareList <- list("cdcDaily"=readFromRDS("cdc_daily_210528")$dfRaw$cdcDaily,
"cdcHosp"=readFromRDS("cdc_daily_210528")$dfRaw$cdcHosp
)
cdc_daily_210708 <- readRunCDCDaily(thruLabel="Jul 08, 2021",
downloadTo=lapply(readList, FUN=function(x) if(file.exists(x)) NA else x),
readFrom=readList,
compareFile=compareList,
writeLog="./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210708.log",
useClusters=readFromRDS("cdc_daily_210528")$useClusters,
skipAssessmentPlots=FALSE,
brewPalette="Paired"
)
##
## -- Column specification --------------------------------------------------------
## cols(
## submission_date = col_character(),
## state = col_character(),
## tot_cases = col_double(),
## conf_cases = col_double(),
## prob_cases = col_double(),
## new_case = col_double(),
## pnew_case = col_double(),
## tot_death = col_double(),
## conf_death = col_double(),
## prob_death = col_double(),
## new_death = col_double(),
## pnew_death = col_double(),
## created_at = col_character(),
## consent_cases = col_character(),
## consent_deaths = col_character()
## )
##
## *** File has been checked for uniqueness by: state date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 40
## Detailed differences available in: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210708.log
##
## Checking for similarity of: state
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 432 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210708.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 43 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210708.log
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## state = col_character(),
## date = col_date(format = ""),
## geocoded_state = col_logical()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: state date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference: deaths_covid deaths_covid_coverage
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 42
## Detailed differences available in: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210708.log
##
## Checking for similarity of: state
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 3 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210708.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 57 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210708.log
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 6
## isType tot_cases tot_deaths new_cases new_deaths n
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 before 7.32e+9 1.49e+8 3.35e+7 596979 31329
## 2 after 7.29e+9 1.48e+8 3.33e+7 594255 27081
## 3 pctchg 4.40e-3 3.91e-3 4.57e-3 0.00456 0.136
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 5
## isType inp hosp_adult hosp_ped n
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 before 2.70e+7 2.11e+7 447142 26198
## 2 after 2.69e+7 2.10e+7 435737 25251
## 3 pctchg 5.65e-3 5.67e-3 0.0255 0.0361
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj =
## prefer_proj): Discarded datum unknown in CRS definition
saveToRDS(cdc_daily_210708, ovrWrite=FALSE, ovrWriteError=FALSE)
Vaccines data are also available for download on the CDC website:
urlVaccine <- "https://data.cdc.gov/api/views/unsk-b7fc/rows.csv?accessType=DOWNLOAD"
locVaccine <- "./RInputFiles/Coronavirus/CDC_vax_downloaded_210712.csv"
fileDownload(locVaccine, urlVaccine)
## size isdir mode
## ./RInputFiles/Coronavirus/CDC_vax_downloaded_210712.csv 4270315 FALSE 666
## mtime
## ./RInputFiles/Coronavirus/CDC_vax_downloaded_210712.csv 2021-07-12 09:01:36
## ctime
## ./RInputFiles/Coronavirus/CDC_vax_downloaded_210712.csv 2021-07-12 09:01:11
## atime exe
## ./RInputFiles/Coronavirus/CDC_vax_downloaded_210712.csv 2021-07-12 09:01:36 no
The file has many fields, including:
An individual can live in one state but be vaccinated in another state. Per the CDC field descriptions:
Fully vaccinated (series complete) metrics is defined as “Total number of people who are fully vaccinated (have second dose of a two-dose vaccine or one dose of a single-dose vaccine) based on the jurisdiction where recipient lives”
vaxRaw_210712 <- fileRead(locVaccine)
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## Date = col_character(),
## Location = col_character()
## )
## i Use `spec()` for the full column specifications.
glimpse(vaxRaw_210712)
## Rows: 13,618
## Columns: 69
## $ Date <chr> "07/11/2021", "07/11/2021", "07~
## $ MMWR_week <dbl> 28, 28, 28, 28, 28, 28, 28, 28,~
## $ Location <chr> "FL", "IA", "WI", "MO", "ND", "~
## $ Distributed <dbl> 25229075, 3506895, 6207245, 620~
## $ Distributed_Janssen <dbl> 1694500, 188700, 318700, 311400~
## $ Distributed_Moderna <dbl> 10217260, 1460040, 2633920, 254~
## $ Distributed_Pfizer <dbl> 13317315, 1858155, 3254625, 334~
## $ Distributed_Unk_Manuf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ Dist_Per_100K <dbl> 117466, 111151, 106609, 101065,~
## $ Distributed_Per_100k_12Plus <dbl> 134944, 131036, 124150, 118471,~
## $ Distributed_Per_100k_18Plus <dbl> 146274, 144422, 136248, 130124,~
## $ Distributed_Per_100k_65Plus <dbl> 560978, 634211, 610203, 584049,~
## $ Administered <dbl> 21527263, 3073527, 6017859, 520~
## $ Administered_12Plus <dbl> 21519017, 3073495, 6017495, 520~
## $ Administered_18Plus <dbl> 20764735, 2932330, 5732822, 499~
## $ Administered_65Plus <dbl> 7906498, 959982, 1837143, 16454~
## $ Administered_Janssen <dbl> 1048774, 128869, 240681, 174610~
## $ Administered_Moderna <dbl> 8579143, 1297742, 2470502, 1977~
## $ Administered_Pfizer <dbl> 11846137, 1646788, 3306004, 305~
## $ Administered_Unk_Manuf <dbl> 53209, 128, 672, 466, 0, 2580, ~
## $ Administered_Fed_LTC <dbl> 405647, 138684, 182382, 158723,~
## $ Administered_Fed_LTC_Residents <dbl> 209000, 62049, 85961, 85652, 30~
## $ Administered_Fed_LTC_Staff <dbl> 119292, 45853, 59621, 49923, 22~
## $ Administered_Fed_LTC_Unk <dbl> 77355, 30782, 36800, 23148, 137~
## $ Administered_Fed_LTC_Dose1 <dbl> 230126, 87469, 115893, 93047, 3~
## $ Administered_Fed_LTC_Dose1_Residents <dbl> 117587, 35533, 50724, 48321, 15~
## $ Administered_Fed_LTC_Dose1_Staff <dbl> 67708, 28547, 36168, 29112, 117~
## $ Administered_Fed_LTC_Dose1_Unk <dbl> 44831, 23389, 29001, 15614, 811~
## $ Admin_Per_100K <dbl> 100231, 97415, 103356, 84885, 8~
## $ Admin_Per_100k_12Plus <dbl> 115099, 114842, 120355, 99498, ~
## $ Admin_Per_100k_18Plus <dbl> 120391, 120760, 125835, 104872,~
## $ Admin_Per_100k_65Plus <dbl> 175804, 173610, 180600, 154933,~
## $ Recip_Administered <dbl> 21237913, 3069562, 5974955, 511~
## $ Administered_Dose1_Recip <dbl> 11763654, 1638173, 3163125, 281~
## $ Administered_Dose1_Pop_Pct <dbl> 54.8, 51.9, 54.3, 45.9, 44.4, 5~
## $ Administered_Dose1_Recip_12Plus <dbl> 11756137, 1638108, 3162679, 281~
## $ Administered_Dose1_Recip_12PlusPop_Pct <dbl> 62.9, 61.2, 63.3, 53.8, 53.0, 6~
## $ Administered_Dose1_Recip_18Plus <dbl> 11323495, 1562036, 3007052, 269~
## $ Administered_Dose1_Recip_18PlusPop_Pct <dbl> 65.7, 64.3, 66.0, 56.6, 56.0, 7~
## $ Administered_Dose1_Recip_65Plus <dbl> 4061097, 490657, 920145, 859645~
## $ Administered_Dose1_Recip_65PlusPop_Pct <dbl> 90.3, 88.7, 90.5, 80.9, 83.5, 8~
## $ Series_Complete_Yes <dbl> 10086805, 1537214, 2951037, 243~
## $ Series_Complete_Pop_Pct <dbl> 47.0, 48.7, 50.7, 39.7, 39.4, 5~
## $ Series_Complete_12Plus <dbl> 10085351, 1537191, 2950892, 243~
## $ Series_Complete_12PlusPop_Pct <dbl> 53.9, 57.4, 59.0, 46.6, 47.0, 6~
## $ Series_Complete_18Plus <dbl> 9776152, 1473385, 2825253, 2353~
## $ Series_Complete_18PlusPop_Pct <dbl> 56.7, 60.7, 62.0, 49.4, 50.0, 6~
## $ Series_Complete_65Plus <dbl> 3551211, 475114, 889344, 779851~
## $ Series_Complete_65PlusPop_Pct <dbl> 79.0, 85.9, 87.4, 73.4, 74.5, 8~
## $ Series_Complete_Janssen <dbl> 1031811, 126334, 232849, 175144~
## $ Series_Complete_Moderna <dbl> 3807918, 629990, 1161367, 90416~
## $ Series_Complete_Pfizer <dbl> 5229909, 780797, 1556520, 13597~
## $ Series_Complete_Unk_Manuf <dbl> 17167, 93, 301, 101, 1, 792, 75~
## $ Series_Complete_Janssen_12Plus <dbl> 1031093, 126332, 232832, 175129~
## $ Series_Complete_Moderna_12Plus <dbl> 3807322, 629983, 1161353, 90415~
## $ Series_Complete_Pfizer_12Plus <dbl> 5229769, 780783, 1556406, 13597~
## $ Series_Complete_Unk_Manuf_12Plus <dbl> 17167, 93, 301, 101, 1, 792, 74~
## $ Series_Complete_Janssen_18Plus <dbl> 1030595, 126273, 232707, 174990~
## $ Series_Complete_Moderna_18Plus <dbl> 3806853, 629858, 1161109, 90392~
## $ Series_Complete_Pfizer_18Plus <dbl> 4921576, 717161, 1431144, 12746~
## $ Series_Complete_Unk_Manuf_18Plus <dbl> 17128, 93, 293, 93, 1, 781, 742~
## $ Series_Complete_Janssen_65Plus <dbl> 179075, 11728, 24812, 33179, 35~
## $ Series_Complete_Moderna_65Plus <dbl> 1755611, 252070, 432381, 357176~
## $ Series_Complete_Pfizer_65Plus <dbl> 1604988, 211256, 432022, 389451~
## $ Series_Complete_Unk_Manuf_65Plus <dbl> 11537, 60, 129, 45, 0, 464, 326~
## $ Series_Complete_FedLTC <dbl> 174063, 50507, 65859, 65388, 30~
## $ Series_Complete_FedLTC_Residents <dbl> 89676, 26063, 34733, 36971, 141~
## $ Series_Complete_FedLTC_Staff <dbl> 50661, 16950, 23251, 20660, 105~
## $ Series_Complete_FedLTC_Unknown <dbl> 33726, 7494, 7875, 7757, 552, 9~
vaxRenamer <- c("Location"="state",
"Date"="date",
"Admin_Per_100K"="Admin_Per_100k"
)
vaxKeeper <- c("state", "date", "MMWR_week",
"Administered", "Administered_12Plus", "Administered_18Plus", "Administered_65Plus",
"Admin_Per_100k", "Admin_Per_100k_12Plus", "Admin_Per_100k_18Plus", "Admin_Per_100k_65Plus",
"Recip_Administered",
"Series_Complete_Yes",
"Series_Complete_12Plus", "Series_Complete_18Plus", "Series_Complete_65Plus",
"Series_Complete_Pop_Pct",
"Series_Complete_12PlusPop_Pct", "Series_Complete_18PlusPop_Pct", "Series_Complete_65PlusPop_Pct"
)
vaxProcessed_210712 <- vaxRaw_210712 %>%
colRenamer(vecRename=vaxRenamer) %>%
colSelector(vecSelect=vaxKeeper) %>%
colMutater(selfList=list("date"=lubridate::mdy))
glimpse(vaxProcessed_210712)
## Rows: 13,618
## Columns: 20
## $ state <chr> "FL", "IA", "WI", "MO", "ND", "VA", "US"~
## $ date <date> 2021-07-11, 2021-07-11, 2021-07-11, 202~
## $ MMWR_week <dbl> 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, ~
## $ Administered <dbl> 21527263, 3073527, 6017859, 5209747, 649~
## $ Administered_12Plus <dbl> 21519017, 3073495, 6017495, 5209418, 648~
## $ Administered_18Plus <dbl> 20764735, 2932330, 5732822, 4999072, 626~
## $ Administered_65Plus <dbl> 7906498, 959982, 1837143, 1645444, 19396~
## $ Admin_Per_100k <dbl> 100231, 97415, 103356, 84885, 85282, 110~
## $ Admin_Per_100k_12Plus <dbl> 115099, 114842, 120355, 99498, 101825, 1~
## $ Admin_Per_100k_18Plus <dbl> 120391, 120760, 125835, 104872, 107725, ~
## $ Admin_Per_100k_65Plus <dbl> 175804, 173610, 180600, 154933, 161847, ~
## $ Recip_Administered <dbl> 21237913, 3069562, 5974955, 5114570, 618~
## $ Series_Complete_Yes <dbl> 10086805, 1537214, 2951037, 2439175, 300~
## $ Series_Complete_12Plus <dbl> 10085351, 1537191, 2950892, 2439129, 299~
## $ Series_Complete_18Plus <dbl> 9776152, 1473385, 2825253, 2353696, 2909~
## $ Series_Complete_65Plus <dbl> 3551211, 475114, 889344, 779851, 89281, ~
## $ Series_Complete_Pop_Pct <dbl> 47.0, 48.7, 50.7, 39.7, 39.4, 52.9, 48.0~
## $ Series_Complete_12PlusPop_Pct <dbl> 53.9, 57.4, 59.0, 46.6, 47.0, 61.6, 56.1~
## $ Series_Complete_18PlusPop_Pct <dbl> 56.7, 60.7, 62.0, 49.4, 50.0, 63.7, 58.8~
## $ Series_Complete_65PlusPop_Pct <dbl> 79.0, 85.9, 87.4, 73.4, 74.5, 81.4, 79.0~
Counts by state are created:
vaxState <- vaxProcessed_210712 %>%
group_by(state) %>%
filter(date==max(date)) %>%
select(state, date, Administered, Recip_Administered, Series_Complete_Yes) %>%
ungroup() %>%
arrange(-Administered)
vaxState
## # A tibble: 65 x 5
## state date Administered Recip_Administered Series_Complete_Yes
## <chr> <date> <dbl> <dbl> <dbl>
## 1 US 2021-07-11 334151648 334151648 159266536
## 2 CA 2021-07-11 43609176 43607956 20176353
## 3 TX 2021-07-11 26245668 25536886 12230164
## 4 NY 2021-07-11 22233988 22166452 10763740
## 5 FL 2021-07-11 21527263 21237913 10086805
## 6 PA 2021-07-11 14126934 14159474 6486641
## 7 IL 2021-07-11 13206252 13344907 5971607
## 8 OH 2021-07-11 10835735 10710147 5318622
## 9 NJ 2021-07-11 10029522 10332551 5006341
## 10 MI 2021-07-11 9562802 9766213 4780127
## # ... with 55 more rows
vaxState %>%
filter(!(state %in% c(state.abb, "DC")))
## # A tibble: 14 x 5
## state date Administered Recip_Administered Series_Complete_Yes
## <chr> <date> <dbl> <dbl> <dbl>
## 1 US 2021-07-11 334151648 334151648 159266536
## 2 LTC 2021-07-11 7899665 0 0
## 3 VA2 2021-07-11 5381413 5381413 2706838
## 4 DD2 2021-07-11 4382578 4382578 1888769
## 5 PR 2021-07-11 3832854 3860036 1839207
## 6 IH2 2021-07-11 1459669 1459669 668566
## 7 BP2 2021-07-11 197049 197049 97863
## 8 GU 2021-07-11 194248 194467 93628
## 9 VI 2021-07-11 79692 77067 35899
## 10 MP 2021-07-11 57308 57358 27509
## 11 FM 2021-07-11 51997 52375 26444
## 12 AS 2021-07-11 48178 48436 21997
## 13 MH 2021-07-11 34127 34184 16365
## 14 RP 2021-07-11 25416 25637 13284
vaxState %>%
filter(!(state == "US")) %>%
mutate(pctComplete=Series_Complete_Yes/sum(Series_Complete_Yes)) %>%
mutate(is50DC=state %in% c(state.abb, "DC")) %>%
group_by(is50DC) %>%
summarize(n=n(), across(where(is.numeric), sum), .groups="drop")
## # A tibble: 2 x 6
## is50DC n Administered Recip_Administered Series_Complete_Yes pctComplete
## <lgl> <int> <dbl> <dbl> <dbl> <dbl>
## 1 FALSE 13 23644194 15770269 7436369 0.0455
## 2 TRUE 51 328226539 327261454 156057188 0.955
vaxProcessed_210712 %>%
filter(state=="US") %>%
select(state, date, Administered, Recip_Administered, Series_Complete_Yes) %>%
pivot_longer(-c(state, date)) %>%
ggplot(aes(x=date, y=value/1000000)) +
geom_line(aes(group=name, color=name)) +
labs(x="", y="Number of Doses/People (millions)", title="All-US Vaccination totals")
Roughly 5% of completely vaccinated individuals are tracked to entities that do not map back to states. These will be deleted for further analysis, which may lead to some disconnects.
Next steps are to continue processing the data and to integrate with the other state-level metrics.
Implied populations and vaccinations by subgroup are calculated:
vaxImplied_210712 <- vaxProcessed_210712 %>%
mutate(popTot=100*Series_Complete_Yes/Series_Complete_Pop_Pct,
pop65Plus=100*Series_Complete_65Plus/Series_Complete_65PlusPop_Pct,
pop18Plus=100*Series_Complete_18Plus/Series_Complete_18PlusPop_Pct,
pop12Plus=100*Series_Complete_12Plus/Series_Complete_12PlusPop_Pct,
pop1864=pop18Plus-pop65Plus,
pop1217=pop12Plus-pop18Plus,
pop0011=popTot-pop12Plus,
vax65Plus=Series_Complete_65Plus,
vax1864=Series_Complete_18Plus-Series_Complete_65Plus,
vax1217=Series_Complete_12Plus-Series_Complete_18Plus,
vax0011=Series_Complete_Yes-Series_Complete_12Plus
)
popData <- vaxImplied_210712 %>%
filter(state %in% c(state.abb, "DC", "PR", "US")) %>%
group_by(state) %>%
summarize(across(.cols=c(pop65Plus, pop1864, pop1217, pop0011),
.fns=list(mu=~mean(.x, na.rm=TRUE),
sdmu=~sd(.x, na.rm=TRUE)/mean(.x, na.rm=TRUE),
rangemu=~diff(range(.x, na.rm=TRUE)/mean(.x, na.rm=TRUE))
)
),
.groups="drop"
)
popData %>%
select(state, contains("_rangemu")) %>%
pivot_longer(-state) %>%
ggplot(aes(x=fct_reorder(state, value, .fun=max), y=value)) +
geom_point() +
coord_flip() +
facet_wrap(~name, nrow=1) +
labs(y="Range divided by mean", x=NULL, title="Consistency of population estimates by subgroup and state")
## Warning: Removed 2 rows containing missing values (geom_point).
popData %>%
select(state, contains("_mu")) %>%
pivot_longer(-state) %>%
group_by(state) %>%
mutate(pct65Plus=sum(ifelse(name=="pop65Plus_mu", value, 0))/sum(value)) %>%
ungroup() %>%
ggplot(aes(x=fct_reorder(state, pct65Plus), y=value)) +
geom_col(aes(fill=name), position="fill") +
coord_flip() +
scale_fill_discrete("") +
labs(y="Proportion of population", x=NULL, title="Population breakout by state")
## Warning: Removed 4 rows containing missing values (geom_col).
vaxImplied_210712 %>%
filter(state=="US") %>%
select(state, date, starts_with("vax")) %>%
pivot_longer(-c(state, date)) %>%
ggplot(aes(x=date, y=value)) +
geom_line(aes(group=name, color=name))
vaxImplied_210712 %>%
filter(state=="US") %>%
select(state, date, starts_with("vax")) %>%
pivot_longer(-c(state, date)) %>%
mutate(eq0=(value==0), lt0=(value<0)) %>%
filter(value<=0) %>%
group_by(eq0, lt0, name) %>%
summarize(across(date, .fns=list(min=min, max=max)), .groups="drop")
## # A tibble: 5 x 5
## eq0 lt0 name date_min date_max
## <lgl> <lgl> <chr> <date> <date>
## 1 FALSE TRUE vax1217 2021-03-05 2021-05-12
## 2 TRUE FALSE vax0011 2020-12-13 2021-03-04
## 3 TRUE FALSE vax1217 2020-12-13 2021-03-04
## 4 TRUE FALSE vax1864 2020-12-13 2021-03-04
## 5 TRUE FALSE vax65Plus 2020-12-13 2021-03-04
Population estimates are generally consistent by state across dates, with the greatest variability in the 12-17 age estimates (expected since it is the smallest group where rounded percent vaccinated would have the most impact).
Distributions by age and state appear reasonable.
There has clearly been a change in tracking where fully vaccinated are tracked using age buckets:
Next steps are to modify code so that subtotal statistics by age bucket are used only when where appropriate.
The availability of fields for state ‘US’ (full nation) is explored:
vaxProcessed_210712 %>%
filter(state=="US") %>%
pivot_longer(-c(state, date)) %>%
mutate(valType=case_when(value < 0 ~ "red", value==0 ~ "orange", value > 0 ~ "green")) %>%
ggplot(aes(x=date, y=fct_reorder(name, valType=="green", .fun=sum), fill=valType)) +
geom_tile() +
scale_fill_identity() +
labs(x=NULL, y=NULL, title="Data availability by metric", subtitle="Red is negative, orange is zero")
In the early months, data are available only for administration. The “series complete” metrics are introduced later, with the 12Plus bucket added even later as authorizations for use in ages 12-17 were added.
A comparison of states/DC to US is made for each of the key metrics:
vaxProcessed_210712 %>%
mutate(stateType=case_when(state=="US" ~ "US", state %in% c(state.abb, "DC") ~ "state/DC", TRUE ~ "other")) %>%
group_by(stateType, date, MMWR_week) %>%
summarize(across(where(is.numeric), .fns=sum), .groups="drop") %>%
pivot_longer(-c(stateType, date, MMWR_week)) %>%
filter(!(str_detect(name, "Per|Pct"))) %>%
ggplot(aes(x=date, y=value)) +
geom_line(aes(group=stateType, color=stateType)) +
facet_wrap(~name, scales="free_y")
In general, the sum of the states and DC are close to the total for US. Per capita and percentage metrics cannot be summed and were not compared directly.
Next steps are to adapt the population splits to account for the variable timing of initial data availability A heuristic can likely be used for the split of 65Plus in the early days, with 12Plus and 18Plus assumed to be equal (no usage in 0-17 group) prior to age being broken out.
An assumption is made that Series_Complete_Yes maps to the oldest group still left to populate when data breakouts are incomplete:
vaxImplied_210712_v2 <- vaxProcessed_210712 %>%
mutate(popTot=100*Series_Complete_Yes/Series_Complete_Pop_Pct,
pop65Plus=100*Series_Complete_65Plus/Series_Complete_65PlusPop_Pct,
pop18Plus=100*Series_Complete_18Plus/Series_Complete_18PlusPop_Pct,
pop12Plus=100*Series_Complete_12Plus/Series_Complete_12PlusPop_Pct,
pop1864=pop18Plus-pop65Plus,
pop1217=pop12Plus-pop18Plus,
pop0011=popTot-pop12Plus,
vax65Plus=Series_Complete_65Plus,
vax1864=Series_Complete_18Plus-Series_Complete_65Plus,
vax1217=ifelse(Series_Complete_12Plus>0, Series_Complete_12Plus, Series_Complete_Yes)-Series_Complete_18Plus,
vax0011=Series_Complete_Yes-vax65Plus-vax1864-vax1217
)
popData_v2 <- vaxImplied_210712_v2 %>%
filter(state %in% c(state.abb, "DC", "US")) %>%
group_by(state) %>%
summarize(across(.cols=c(popTot, pop65Plus, pop1864, pop1217, pop0011),
.fns=list(mu=~mean(.x, na.rm=TRUE),
sdmu=~sd(.x, na.rm=TRUE)/mean(.x, na.rm=TRUE),
rangemu=~diff(range(.x, na.rm=TRUE)/mean(.x, na.rm=TRUE))
)
),
.groups="drop"
)
popData_v2 %>%
select(state, contains("_rangemu")) %>%
pivot_longer(-state) %>%
ggplot(aes(x=fct_reorder(state, value, .fun=max), y=value)) +
geom_point() +
coord_flip() +
facet_wrap(~name, nrow=1) +
labs(y="Range divided by mean",
x=NULL,
title="Consistency of population estimates by subgroup and state"
)
popData_v2 %>%
select(state, contains("_mu"), -contains("popTot")) %>%
pivot_longer(-state) %>%
group_by(state) %>%
mutate(pct65Plus=sum(ifelse(name=="pop65Plus_mu", value, 0))/sum(value)) %>%
ungroup() %>%
ggplot(aes(x=fct_reorder(state, pct65Plus), y=value)) +
geom_col(aes(fill=name), position="fill") +
coord_flip() +
scale_fill_discrete("") +
labs(y="Proportion of population", x=NULL, title="Population breakout by state")
vaxImplied_210712_v2 %>%
filter(state=="US") %>%
select(state, date, starts_with("vax")) %>%
pivot_longer(-c(state, date)) %>%
ggplot(aes(x=date, y=value)) +
geom_line(aes(group=name, color=name))
vaxImplied_210712_v2 %>%
filter(state=="US") %>%
select(state, date, starts_with("vax")) %>%
pivot_longer(-c(state, date)) %>%
mutate(eq0=(value==0), lt0=(value<0)) %>%
filter(value<=0) %>%
group_by(eq0, lt0, name) %>%
summarize(across(date, .fns=list(min=min, max=max)), .groups="drop")
## # A tibble: 4 x 5
## eq0 lt0 name date_min date_max
## <lgl> <lgl> <chr> <date> <date>
## 1 TRUE FALSE vax0011 2020-12-13 2021-05-12
## 2 TRUE FALSE vax1217 2020-12-13 2021-03-04
## 3 TRUE FALSE vax1864 2020-12-13 2021-03-04
## 4 TRUE FALSE vax65Plus 2020-12-13 2021-03-04
Data appear reasonable for further use, though with some anomalies still related to the breakouts by age. Metrics per million on a rolling-7 basis are created:
popDataUse <- popData_v2 %>%
filter(state %in% c(state.abb, "DC")) %>%
select(state, contains("_mu")) %>%
pivot_longer(-state) %>%
mutate(ageGroup=stringr::str_replace_all(name, "pop|_mu", "")) %>%
rename(pop=value) %>%
select(state, ageGroup, pop)
vaxDataUse <- vaxImplied_210712_v2 %>%
filter(state %in% c(state.abb, "DC")) %>%
select(state, date, vaxTot=Series_Complete_Yes, starts_with("vax")) %>%
pivot_longer(-c(state, date)) %>%
mutate(ageGroup=stringr::str_replace_all(name, "vax", "")) %>%
rename(vax=value) %>%
select(state, date, ageGroup, vax)
popVaxData <- vaxDataUse %>%
inner_join(popDataUse, by=c("state", "ageGroup")) %>%
mutate(vaxpct=vax/pop) %>%
arrange(state, ageGroup, date) %>%
group_by(state, ageGroup) %>%
helperRollingAgg(origVar="vaxpct", newName="vaxpct7") %>%
ungroup()
popVaxData %>%
filter(!is.na(vaxpct7)) %>%
ggplot(aes(x=date, y=vaxpct7)) +
geom_line(aes(group=state, color=state.region[match(state, state.abb)]), alpha=0.5) +
lims(y=c(0, 1)) +
facet_wrap(~ageGroup) +
labs(title="Percent Fully Vaccinated", x=NULL, y="Rolling 7 'Series Complete' percentage") +
scale_color_discrete("Census\nRegion")
popVaxData %>%
filter(!is.na(vaxpct7)) %>%
ggplot(aes(x=date, y=vaxpct7)) +
geom_line(aes(group=ageGroup, color=ageGroup)) +
lims(y=c(0, 1)) +
facet_wrap(~state) +
labs(title="Percent Fully Vaccinated", x=NULL, y="Rolling 7 'Series Complete' percentage") +
scale_color_discrete("Age")
Next steps are to incorporate these steps as a reproducible function.
The function readQCRawCDCDaily() is copied and applied:
# Function to read and check a raw data file
readQCRawCDCDaily <- function(fileName,
writeLog=NULL,
ovrwriteLog=TRUE,
dfRef=NULL,
urlType=NULL,
url=NULL,
getData=TRUE,
ovrWriteDownload=FALSE,
vecRename=NULL,
selfList=NULL,
fullList=NULL,
uniqueBy=NULL,
step3Group=NULL,
step3Vals=NULL,
step4KeyVars=NULL,
step5PlotItems=NULL,
step6AggregateList=NULL,
inferVars=list("url"=urlMapper,
"vecRename"=renMapper,
"selfList"=selfListMapper,
"fullList"=fullListMapper,
"uniqueBy"=uqMapper,
"step3Group"=checkControlGroupMapper,
"step3Vals"=checkControlVarsMapper,
"step4KeyVars"=checkSimilarityMapper,
"step5PlotItems"=plotSimilarityMapper,
"step6AggregateList"=keyAggMapper
)
) {
# FUNCTION ARGUMENTS
# fileName: the location where downloaded data either is, or will be, stored
# writeLog: the external file location for printing (NULL means use the main log stdout)
# ovrwriteLog: boolean, if using an external log, should it be started from scratch (overwritten)?
# dfRef: a reference data frame for comparison (either NULL or NA means do not run comparisons)
# urlType: character vector that can be mapped using urlMapper and keyVarMapper
# url: direct URL passed as character string
# NOTE that if both url and urlType are NULL, no file will be downloaded
# getData: boolean, should an attempt be made to get new data using urlType or url?
# ovrWriteDownload: boolean, if fileName already exists, should it be overwritten?
# vecRename: vector for renaming c('existing name'='new name'), can be any length from 0 to ncol(df)
# NULL means infer from urlType, if not available there use c()
# selfList: list for functions to apply to self, list('variable'=fn) will apply variable=fn(variable)
# processed in order, so more than one function can be applied to self
# NULL means infer from urlType, if not available in mapping file use list()
# fullList: list for general functions to be applied, list('new variable'=expression(code))
# will create 'new variable' as eval(expression(code))
# for now, requires passing an expression
# NULL means infer from urlType, use list() if not in mapping file
# uniqueBy: combination of variables for checking uniqueness
# NULL means infer from data, keep as NULL (meaning use-all) if cannot be inferred
# step3Group: variable to be used as the x-axis (grouping) for step 3 plots
# NULL means infer from data
# step3Vals: values to be plotted on the y-axis for step 3 plots
# NULL means infer from data
# step4KeyVars: list of parameters to be passed as keyVars= in step 4
# NULL means infer from urlType
# step5PlotItems: items to be plotted in step 5
# NULL means infer from urlType
# step6AggregateList: drives the elements to be passed to compareAggregate() and flagLargeDelta()
# NULL means infer from urlType
# inferVars: vector of c('variable'='mapper') for inferring parameter values when passed as NULL
# Step 0a: Use urlType to infer key variables if passed as NULL
for (vrbl in names(inferVars)) {
mapper <- inferVars[[vrbl]]
if (is.null(get(vrbl))) {
if (urlType %in% names(mapper)) assign(vrbl, mapper[[urlType]])
else if ("default" %in% names(mapper)) assign(vrbl, mapper[["default"]])
}
}
# Step 1: Download a new file (if requested)
if (!is.null(url) & isTRUE(getData)) fileDownload(fileName=fileName, url=url, ovrWrite=ovrWriteDownload)
else cat("\nNo file has been downloaded, will use existing file:", fileName, "\n")
# Step 2: Read file, rename and mutate variables, confirm uniqueness by expected levels
dfRaw <- fileRead(fileName) %>%
colRenamer(vecRename) %>%
colMutater(selfList=selfList, fullList=fullList) %>%
checkUniqueRows(uniqueBy=uniqueBy)
# Step 3: Plot basic control totals for new cases and new deaths by month
dfRaw %>%
checkControl(groupBy=step3Group, useVars=step3Vals, printControls=FALSE, na.rm=TRUE) %>%
helperLinePlot(x=step3Group, y="newValue", facetVar="name", facetScales="free_y", groupColor="name")
# If there is no file for comparison, return the data
if (is.null(dfRef) | if(length(dfRef)==1) is.na(dfRef) else FALSE) return(dfRaw)
# Step 4b: Check similarity of existing and reference file
# ovrWriteLog=FALSE since everything should be an append after the opening text line in step 0
diffRaw <- checkSimilarity(df=dfRaw,
ref=dfRef,
keyVars=step4KeyVars,
writeLog=writeLog,
ovrwriteLog=FALSE
)
# Step 5: Plot the similarity checks
plotSimilarity(diffRaw, plotItems=step5PlotItems)
# Step 6: Plot and report on differences in aggregates
helperAggMap <- function(x) {
h1 <- compareAggregate(df=dfRaw, ref=dfRef, grpVar=x$grpVar, numVars=x$numVars,
sameUniverse=x$sameUniverse, plotData=x$plotData, isLine=x$isLine,
returnDelta=x$returnDelta)
if (isTRUE(x$flagLargeDelta)) {
h2 <- flagLargeDelta(h1, pctTol=x$pctTol, absTol=x$absTol, sortBy=x$sortBy,
dropNA=x$dropNA, printAll=x$printAll
)
if (is.null(writeLog)) print(h2)
else {
cat(nrow(h2), " records", sep="")
txt <- paste0("\n\n***Differences of at least ",
x$absTol,
" and at least ",
round(100*x$pctTol, 3), "%\n\n"
)
printLog(h2, txt=txt, writeLog=writeLog)
}
}
}
lapply(step6AggregateList, FUN=helperAggMap)
cat("\n\n")
# Return the raw data file
dfRaw
}
# Run without downloading data and without a comparison file
vaxRaw_210712_func <- readQCRawCDCDaily(fileName="./RInputFiles/Coronavirus/CDC_vax_downloaded_210712.csv",
url="https://data.cdc.gov/api/views/unsk-b7fc/rows.csv?accessType=DOWNLOAD",
getData=FALSE,
vecRename=c("Location"="state",
"Date"="date",
"Admin_Per_100K"="Admin_Per_100k"
),
selfList=list("date"=lubridate::mdy),
uniqueBy=c("date", "state"),
step3Group=c("date"),
step3Vals=c("Administered",
"Series_Complete_Yes",
"Series_Complete_12Plus",
"Series_Complete_18Plus",
"Series_Complete_65Plus"
),
inferVars=list()
)
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/CDC_vax_downloaded_210712.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## Date = col_character(),
## Location = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: date state
While there is double-counting due to the “US” record being included, the general process for a basic file read is working as intended. Next steps are to update the process to allow for comparison to an existing file.
The latest vaccines data are downloaded, with results cached:
urlVaccine <- "https://data.cdc.gov/api/views/unsk-b7fc/rows.csv?accessType=DOWNLOAD"
locVaccine <- "./RInputFiles/Coronavirus/CDC_vax_downloaded_210717.csv"
fileDownload(locVaccine, urlVaccine)
## size isdir mode
## ./RInputFiles/Coronavirus/CDC_vax_downloaded_210717.csv 4406078 FALSE 666
## mtime
## ./RInputFiles/Coronavirus/CDC_vax_downloaded_210717.csv 2021-07-17 08:17:29
## ctime
## ./RInputFiles/Coronavirus/CDC_vax_downloaded_210717.csv 2021-07-17 08:17:26
## atime exe
## ./RInputFiles/Coronavirus/CDC_vax_downloaded_210717.csv 2021-07-17 08:17:29 no
The function readQCRawCDCDaily() is applied using the previous data as the control:
# Run without downloading data and with a comparison file
vaxRaw_210717_func <- readQCRawCDCDaily(fileName="./RInputFiles/Coronavirus/CDC_vax_downloaded_210717.csv",
dfRef=vaxRaw_210712_func,
url="https://data.cdc.gov/api/views/unsk-b7fc/rows.csv?accessType=DOWNLOAD",
getData=FALSE,
vecRename=c("Location"="state",
"Date"="date",
"Admin_Per_100K"="Admin_Per_100k"
),
selfList=list("date"=lubridate::mdy),
uniqueBy=c("date", "state"),
step3Group=c("date"),
step3Vals=c("Administered",
"Series_Complete_Yes",
"Series_Complete_12Plus",
"Series_Complete_18Plus",
"Series_Complete_65Plus"
),
step4KeyVars=list(date=list(label='date', countOnly=TRUE, convChar=TRUE),
state=list(label='state', countOnly=FALSE)
),
step5PlotItems=c("date"),
step6AggregateList=list("l1"=list("grpVar"="date",
"numVars"=c("Administered",
"Series_Complete_Yes",
"Series_Complete_12Plus",
"Series_Complete_18Plus",
"Series_Complete_65Plus"
),
"sameUniverse"=NA,
"plotData"=TRUE,
"isLine"=TRUE,
"returnDelta"=TRUE,
"flagLargeDelta"=TRUE,
"pctTol"=0.01,
"absTol"=1,
"sortBy"=c("name", "pctDelta", "absDelta"),
"dropNA"=TRUE,
"printAll"=TRUE
),
"l3"=list("grpVar"="state",
"numVars"=c("Administered",
"Series_Complete_Yes",
"Series_Complete_12Plus",
"Series_Complete_18Plus",
"Series_Complete_65Plus"
),
"sameUniverse"="date",
"plotData"=TRUE,
"isLine"=FALSE,
"returnDelta"=TRUE,
"flagLargeDelta"=TRUE,
"pctTol"=0.001,
"absTol"=0,
"sortBy"=c("name", "pctDelta", "absDelta"),
"dropNA"=TRUE,
"printAll"=TRUE
)
),
inferVars=list()
)
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/CDC_vax_downloaded_210717.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## Date = col_character(),
## Location = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: date state
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 5
##
## Checking for similarity of: state
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 1 and at least 1%
##
## [1] date name newValue refValue absDelta pctDelta
## <0 rows> (or 0-length row.names)
##
##
## ***Differences of at least 0 and at least 0.1%
##
## [1] state name newValue refValue absDelta pctDelta
## <0 rows> (or 0-length row.names)
The function works well for reading a raw vaccines data file, running basic checks, and comparing to a previous vaccines data file. Next steps are to adapt the function for processing a vaccines data file.
The function processRawFile() is leveraged:
# Generic function for processing a raw file
processRawFile <- function(df,
vecRename=c(),
vecSelect=NULL,
lstCombo=list(),
lstFilter=list(),
lstExclude=list()
) {
# FUNCTION ARGUMENTS:
# df: the raw data frame or tibble
# vecRename: vector for renaming c('existing name'='new name'), can be any length from 0 to ncol(df)
# vecSelect: vector of columns to select (run after vecRename), NULL means select all columns
# lstCombo: a nested list of combinations to be applied
# each element of the list should include comboVar, uqVars, vecCombo, and fn
# lstFilter: a list for filtering records, of form list("field"=c("allowed values"))
# lstExclude: a list for filtering records, of form list("field"=c("disallowed values"))
# STEP 1: Rename and select variables (selection occurs AFTER renaming)
dfProcess <- df %>%
colRenamer(vecRename=vecRename) %>%
colSelector(vecSelect=vecSelect)
# STEP 2: Combine multiple records to a single record
for (ctr in seq_along(lstCombo)) {
dfProcess <- dfProcess %>%
combineRows(comboVar=lstCombo[[ctr]]$comboVar,
uqVars=lstCombo[[ctr]]$uqVars,
vecCombo=lstCombo[[ctr]]$vecCombo,
fn=lstCombo[[ctr]]$fn
)
}
# STEP 3: Filter records
qcOrig <- dfProcess %>%
summarize(across(where(is.numeric), sum, na.rm=TRUE), n=n()) %>%
mutate(isType="before")
dfProcess <- dfProcess %>%
rowFilter(lstFilter=lstFilter, lstExclude=lstExclude)
# STEP 4: Report on differences
cat("\nColumn sums before and after applying filtering rules:\n")
dfProcess %>%
summarize(across(where(is.numeric), sum, na.rm=TRUE), n=n()) %>%
mutate(isType="after") %>%
bind_rows(qcOrig) %>%
arrange(desc(isType)) %>%
bind_rows(mutate(summarize(., across(where(is.numeric), function(x) (max(x)-min(x))/max(x))),
isType="pctchg"
)
) %>%
select(isType, everything()) %>%
print()
cat("\n")
# Return the processed data file
dfProcess
}
vaxProc_210717_func <- processRawFile(vaxRaw_210717_func,
vecRename=c(),
vecSelect=c("date", "state", "MMWR_week",
"Administered", "Admin_Per_100k",
"Series_Complete_Yes", "Series_Complete_Pop_Pct",
"Series_Complete_12Plus", "Series_Complete_12PlusPop_Pct",
"Series_Complete_18Plus", "Series_Complete_18PlusPop_Pct",
"Series_Complete_65Plus", "Series_Complete_65PlusPop_Pct"
),
lstCombo=list(),
lstFilter=list("state"=c(state.abb, "DC")),
lstExclude=list()
)
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 13
## isType MMWR_week Administered Admin_Per_100k Series_Complete~ Series_Complete~
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 before 2.49e+5 7.14e+10 628437370 2.87e+10 258606.
## 2 after 1.97e+5 3.39e+10 531817808 1.39e+10 218997.
## 3 pctchg 2.10e-1 5.25e- 1 0.154 5.16e- 1 0.153
## # ... with 7 more variables: Series_Complete_12Plus <dbl>,
## # Series_Complete_12PlusPop_Pct <dbl>, Series_Complete_18Plus <dbl>,
## # Series_Complete_18PlusPop_Pct <dbl>, Series_Complete_65Plus <dbl>,
## # Series_Complete_65PlusPop_Pct <dbl>, n <dbl>
vaxProc_210717_func
## # A tibble: 10,965 x 13
## date state MMWR_week Administered Admin_Per_100k Series_Complete_Yes
## <date> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 2021-07-16 FL 28 21688774 100983 10167736
## 2 2021-07-16 KS 28 2540782 87213 1249272
## 3 2021-07-16 SC 28 4292812 83376 2045648
## 4 2021-07-16 AR 28 2310080 76548 1062254
## 5 2021-07-16 ND 28 652273 85593 301349
## 6 2021-07-16 MN 28 5979756 106031 2987234
## 7 2021-07-16 DE 28 1065570 109428 501985
## 8 2021-07-16 IA 28 3087945 97872 1543626
## 9 2021-07-16 NV 28 2867707 93103 1330894
## 10 2021-07-16 DC 28 885481 125467 379400
## # ... with 10,955 more rows, and 7 more variables:
## # Series_Complete_Pop_Pct <dbl>, Series_Complete_12Plus <dbl>,
## # Series_Complete_12PlusPop_Pct <dbl>, Series_Complete_18Plus <dbl>,
## # Series_Complete_18PlusPop_Pct <dbl>, Series_Complete_65Plus <dbl>,
## # Series_Complete_65PlusPop_Pct <dbl>
Next steps are to run the per-capita process for conversion of Administered and Series_Complete_Yes based on the same state population data used for cases, deaths, and hospitalizations.
The function createPerCapita() is leveraged:
# Function to extract and format key state data
getStateData <- function(df=readFromRDS("statePop2019"),
renameVars=c("stateAbb"="state", "NAME"="name", "pop_2019"="pop"),
keepVars=c("state", "name", "pop")
) {
# FUNCTION ARGUMENTS:
# df: the data frame containing state data
# renameVars: variables to be renamed, using named list with format "originalName"="newName"
# keepVars: variables to be kept in the final file
# Rename variables where appropriate
names(df) <- ifelse(is.na(renameVars[names(df)]), names(df), renameVars[names(df)])
# Return file with only key variables kept
df %>%
select_at(vars(all_of(keepVars)))
}
useVars <- c("state", "date", "Administered", "Series_Complete_Yes")
vaxPerCap_210717_func <- createPerCapita(select(vaxProc_210717_func, all_of(useVars)),
uqBy=c("state", "date"),
popData=getStateData(),
mapper=c("Administered"="vxapm", "Series_Complete_Yes"="vxcpm"),
)
vaxPerCap_210717_func
## # A tibble: 10,965 x 8
## state date Administered Series_Complete_Yes vxapm vxcpm vxapm7 vxcpm7
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AK 2020-12-14 0 0 0 0 NA NA
## 2 AL 2020-12-14 0 0 0 0 NA NA
## 3 AR 2020-12-14 0 0 0 0 NA NA
## 4 AZ 2020-12-14 0 0 0 0 NA NA
## 5 CA 2020-12-14 0 0 0 0 NA NA
## 6 CO 2020-12-14 0 0 0 0 NA NA
## 7 CT 2020-12-14 0 0 0 0 NA NA
## 8 DC 2020-12-14 0 0 0 0 NA NA
## 9 DE 2020-12-14 0 0 0 0 NA NA
## 10 FL 2020-12-14 0 0 0 0 NA NA
## # ... with 10,955 more rows
vaxPerCap_210717_func %>%
select(state, date, vxapm7, vxcpm7) %>%
pivot_longer(-c(state, date)) %>%
filter(!is.na(value), name=="vxcpm7") %>%
mutate(region=ifelse(state=="DC", "South Atlantic", as.character(state.division)[match(state, state.abb)])) %>%
ggplot(aes(x=date, y=value/1000000)) +
geom_line(aes(group=state), alpha=0.25) +
geom_line(data=~summarize(group_by(., region, date), value=median(value), .groups="drop"),
aes(color=region)
) +
facet_wrap(~region) +
lims(y=c(0, 1)) +
labs(x=NULL,
y="Proportion Fully Vaccinated (of total population)",
title="Evolution of fully vaccinated by state and census division",
subtitle="Colored line is median in region, gray line is individual states in region"
) +
theme(legend.position="none")
The createPerCapita() function is updated to allow for keeping variables without calculating per-million or rolling-7 aggregates:
# Generic function to create per-capita metrics using an existing file and source of population data
createPerCapita <- function(lst,
uqBy,
popData,
mapper,
asIsVars=c(),
lstSortBy=uqBy,
fnJoin=dplyr::full_join,
popJoinBy="state",
popVar="pop",
k=7,
mult=1000000,
...
) {
# FUNCTION ARGUMENTS:
# lst: A list containing one or more files to be joined OR a data frame that is already joined
# uqBy: character string that the input file is unique by (will be the join keys if a list is passed)
# popData: file containing population data that can be joined to the processed lst
# mapper: mapping file of c('current name'='per capita name') for mapping variables
# asIsVars: variables to be kept, but without creating pm or pm7
# lstSortBy: the sorting that should be used for creating rolling metrics
# fnJoin: The function to be used for joining files
# popJoinBy: character string for the variable(s) to be used in joining popData to lst
# popVar: character string for the variable in popData that represents population
# k: time perior for rolling aggregations
# mult: the unit for the per-capita data (default 1 million means make metrics per million)
# ...: other arguments to be passed to combineFiles()
# Step 1: If a list has been passed, use a joining process to create a data frame
if ("list" %in% class(lst)) lst <- combineFiles(lst, byVars=uqBy, fn=fnJoin, ...)
# Step 2: Sort the data using sortBy
df <- dplyr::arrange(lst, across(all_of(lstSortBy)))
# Step 3: Check that all variables other than uqBy and asIsVars can be mapped using mapper
keyVars <- setdiff(names(df), c(uqBy, asIsVars))
if (any(isFALSE(keyVars %in% mapper))) stop("\nVariable is missing in per capita mapper file\n")
# Step 4: Run the per capita mapping process
df <- helperMakePerCapita(df,
mapVars=mapper[keyVars],
popData=popData,
k=k,
byVar=popJoinBy,
sortVar=setdiff(lstSortBy, popJoinBy),
popVar=popVar,
mult=mult
)
# Return the data frame
df
}
The updated process is then run, keeping the breakout for 65+ and 18+:
uqVars <- c("state", "date")
perCapVars <- c("Administered", "Series_Complete_Yes")
asIsVars <- c("Series_Complete_65Plus", "Series_Complete_65PlusPop_Pct",
"Series_Complete_18Plus", "Series_Complete_18PlusPop_Pct",
"Admin_Per_100k", "Series_Complete_Pop_Pct"
)
vaxPerCap_210717_func_v2 <- createPerCapita(select(vaxProc_210717_func, all_of(c(uqVars, perCapVars, asIsVars))),
uqBy=uqVars,
asIsVars=asIsVars,
popData=getStateData(),
mapper=c("Administered"="vxapm", "Series_Complete_Yes"="vxcpm")
) %>%
colRenamer(c("Series_Complete_Yes"="vxc",
"Administered"="vxa",
"Series_Complete_Pop_Pct"="vxcpoppct",
"Series_Complete_65Plus"="vxcgte65",
"Series_Complete_65PlusPop_Pct"="vxcgte65pct",
"Series_Complete_18Plus"="vxcgte18",
"Series_Complete_18PlusPop_Pct"="vxcgte18pct"
)
)
vaxPerCap_210717_func_v2
## # A tibble: 10,965 x 14
## state date vxa vxc vxcgte65 vxcgte65pct vxcgte18 vxcgte18pct
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AK 2020-12-14 0 0 0 0 0 0
## 2 AL 2020-12-14 0 0 0 0 0 0
## 3 AR 2020-12-14 0 0 0 0 0 0
## 4 AZ 2020-12-14 0 0 0 0 0 0
## 5 CA 2020-12-14 0 0 0 0 0 0
## 6 CO 2020-12-14 0 0 0 0 0 0
## 7 CT 2020-12-14 0 0 0 0 0 0
## 8 DC 2020-12-14 0 0 0 0 0 0
## 9 DE 2020-12-14 0 0 0 0 0 0
## 10 FL 2020-12-14 0 0 0 0 0 0
## # ... with 10,955 more rows, and 6 more variables: Admin_Per_100k <dbl>,
## # vxcpoppct <dbl>, vxapm <dbl>, vxcpm <dbl>, vxapm7 <dbl>, vxcpm7 <dbl>
# Check consistency of 'Admin_Per_100k' and 'vxapm'
vaxPerCap_210717_func_v2 %>%
filter(date==max(date)) %>%
ggplot(aes(x=Admin_Per_100k, y=vxapm)) +
geom_text(aes(label=state)) +
geom_abline(slope=10, intercept=0, lty=2) +
labs(x="Raw data administered per 100k",
y="Function-calculated adminsitered per million",
title="Consistency of raw data and function-calculated per capita data",
subtitle="Dotted line is per-million at 10x per-100k (expected)"
)
# Check consistency of 'vxcpoppct' and 'vxcpm'
vaxPerCap_210717_func_v2 %>%
filter(date==max(date)) %>%
ggplot(aes(x=vxcpoppct, y=vxcpm)) +
geom_text(aes(label=state)) +
geom_abline(slope=10000, intercept=0, lty=2) +
labs(x="Raw data percent of population completely vaccinated",
y="Function-calculated completely vaccinated per million",
title="Consistency of raw data and function-calculated per capita data",
subtitle="Dotted line is per-million at 10,000x per-100 (expected)"
)
The raw data and per-capita totals are aligned, suggesting that population estimates used in the datasets are very similar (functions use 2019 estimates as per getStateData()).
The colMutater() function is added to include:
Variables are added as follows:
# Conversions for 18-64 and 0-17
subGroupList <- list("vxc1864"=expression(vxcgte18-vxcgte65),
"vxc0017"=expression(vxc-vxcgte18)
)
# Conversions for per-day
perDayFunc <- function(x) ifelse(row_number()==1, x, ifelse(lag(x)==0, 0, x-lag(x)))
perDayList <- list("vxa_perday"=expression(perDayFunc(vxa)),
"vxc_perday"=expression(perDayFunc(vxc)),
"vxcgte65_perday"=expression(perDayFunc(vxcgte65)),
"vxc1864_perday"=expression(perDayFunc(vxc1864)),
"vxc0017_perday"=expression(perDayFunc(vxc0017))
)
vaxPerCap_210717_func_v3 <- vaxPerCap_210717_func_v2 %>%
colMutater(fullList=subGroupList) %>%
arrange(date, state) %>%
group_by(state) %>%
colMutater(fullList=perDayList) %>%
ungroup()
# Check that files are identical for same variables
sapply(names(vaxPerCap_210717_func_v2),
FUN=function(x) all.equal(vaxPerCap_210717_func_v2[[x]], vaxPerCap_210717_func_v3[[x]])
) %>%
t() %>%
t()
## [,1]
## state TRUE
## date TRUE
## vxa TRUE
## vxc TRUE
## vxcgte65 TRUE
## vxcgte65pct TRUE
## vxcgte18 TRUE
## vxcgte18pct TRUE
## Admin_Per_100k TRUE
## vxcpoppct TRUE
## vxapm TRUE
## vxcpm TRUE
## vxapm7 TRUE
## vxcpm7 TRUE
# Plot evolution of vaccines by age
vaxPerCap_210717_func_v3 %>%
select(date, vxc, vxcgte65, vxc1864, vxc0017) %>%
group_by(date) %>%
summarize(across(.fns=sum)) %>%
pivot_longer(-date) %>%
ggplot(aes(x=date)) +
geom_point(data=~filter(., name=="vxc"), aes(y=value/1000000)) +
geom_col(data=~filter(., name!="vxc"), aes(y=value/1000000, fill=name), position="stack") +
labs(x=NULL,
y="Completely Vaccinated (millions)",
title="Evolution of fully vaccinated by age group",
subtitle="Dots are total people fully vaccinated"
) +
scale_fill_discrete("Age")
# Plot evolution of vaccines administered per day
vaxPerCap_210717_func_v3 %>%
select(date, vxa, vxa_perday) %>%
group_by(date) %>%
summarize(across(where(is.numeric), sum)) %>%
pivot_longer(-date) %>%
group_by(name) %>%
mutate(value7=zoo::rollmean(value, k=7, fill=NA)) %>%
ungroup() %>%
ggplot(aes(x=date)) +
geom_line(aes(y=value/1000000)) +
geom_line(data=~filter(., name=="vxa_perday", !is.na(value7)), aes(y=value7/1000000), color="red", lwd=2) +
facet_wrap(~c("vxa"="Cumulative", "vxa_perday"="Daily")[name], scales="free_y") +
labs(x=NULL,
y="Vaccines Adminsitered (millions)",
title="Evolution of vaccines administered",
subtitle="Red line is rolling 7-day average"
)
Variables appear to be created as intended.
Next, total population is estimated and plots of vaccines administered per capita are created:
# Plot evolution of vaccines administered per day
vaxPerCap_210717_func_v3 %>%
select(state, date, vxa, vxa_perday, Admin_Per_100k) %>%
group_by(state) %>%
mutate(pop=median(100000*vxa/Admin_Per_100k, na.rm=TRUE)) %>%
ungroup() %>%
pivot_longer(-c(state, date, pop)) %>%
group_by(state, pop, name) %>%
mutate(value7=zoo::rollmean(value, k=7, fill=NA)) %>%
ungroup() %>%
ggplot(aes(x=date)) +
geom_line(data=~filter(., name=="vxa_perday", !is.na(value7), state != "NM"),
aes(y=1000*value7/pop, group=state)
) +
facet_wrap(~state) +
labs(x=NULL,
y="Vaccines Adminsitered (per thousand)",
title="Evolution of vaccines administered (rolling 7-day average)"
)
# Plot evolution of vaccines administered (cumulative)
vaxPerCap_210717_func_v3 %>%
select(state, date, vxa, vxa_perday, Admin_Per_100k) %>%
group_by(state) %>%
mutate(pop=median(100000*vxa/Admin_Per_100k, na.rm=TRUE)) %>%
ungroup() %>%
pivot_longer(-c(state, date, pop)) %>%
group_by(state, pop, name) %>%
mutate(value7=zoo::rollmean(value, k=7, fill=NA)) %>%
ungroup() %>%
ggplot(aes(x=date)) +
geom_line(data=~filter(., name=="vxa", !is.na(value)),
aes(y=1000*value/pop, group=state)
) +
geom_hline(yintercept=1000, lty=2) +
facet_wrap(~state) +
labs(x=NULL,
y="Vaccines Adminsitered (per thousand)",
title="Evolution of vaccines administered (cumulative)"
)
Estimates are made for population 65+, 18-64, and 0-17 based on completion percentages:
popEstAgeState <- vaxPerCap_210717_func_v3 %>%
select(state, date, vxcgte65, vxcgte65pct, vxcgte18, vxcgte18pct, vxc, vxcpoppct) %>%
mutate(popgte65=100*vxcgte65/vxcgte65pct, popgte18=100*vxcgte18/vxcgte18pct, pop=100*vxc/vxcpoppct) %>%
group_by(state) %>%
summarize(across(c(popgte65, popgte18, pop), median, na.rm=TRUE), .groups="drop") %>%
mutate(pop1864=popgte18-popgte65, pop0017=pop-popgte18)
popEstAgeState
## # A tibble: 51 x 6
## state popgte65 popgte18 pop pop1864 pop0017
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AK 91583. 551552. 731518. 459969. 179966.
## 2 AL 849784. 3815294. 4903279. 2965510. 1087985.
## 3 AR 523915. 2317203. 3016324. 1793288. 699121.
## 4 AZ 1308603. 5637959. 7278853. 4329356. 1640894.
## 5 CA 5838155. 30623548. 39513431. 24785393. 8889884.
## 6 CO 842448. 4499815. 5758852. 3657367. 1259037.
## 7 CT 630249. 2837921. 3565957. 2207671. 728037.
## 8 DC 87347. 577646. 705734. 490299. 128088.
## 9 DE 188903. 770105. 973906. 581202. 203801.
## 10 FL 4497505. 17248010. 21479412. 12750505. 4231402.
## # ... with 41 more rows
popEstAgeState %>%
summarize(across(where(is.numeric), sum))
## # A tibble: 1 x 5
## popgte65 popgte18 pop pop1864 pop0017
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 54058078. 255206800. 328239240. 201148722. 73032440.
popEstAgeState %>%
select(-popgte18, -pop) %>%
pivot_longer(-state) %>%
group_by(state) %>%
mutate(pctg65=ifelse(name=="popgte65", value, 0)/sum(value)) %>%
ggplot(aes(x=fct_reorder(state, pctg65, .fun=max), y=value)) +
geom_col(aes(fill=name), position="fill") +
coord_flip() +
labs(x=NULL,
y="Proportion",
title="Distribution of population by state",
subtitle="Estimated from reported vaccine completion percentages by sub-group"
) +
scale_fill_discrete("Age Group")
Metrics for evolution of complete vaccination by age cohort are calculated and plotted:
popState <- popEstAgeState %>%
select(state, popgte65, pop1864, pop0017) %>%
pivot_longer(-state) %>%
mutate(age=str_replace_all(name, "pop", "")) %>%
select(state, age, pop=value)
popState
## # A tibble: 153 x 3
## state age pop
## <chr> <chr> <dbl>
## 1 AK gte65 91583.
## 2 AK 1864 459969.
## 3 AK 0017 179966.
## 4 AL gte65 849784.
## 5 AL 1864 2965510.
## 6 AL 0017 1087985.
## 7 AR gte65 523915.
## 8 AR 1864 1793288.
## 9 AR 0017 699121.
## 10 AZ gte65 1308603.
## # ... with 143 more rows
vaxPerCap_210717_func_v3 %>%
select(state, date, vxcgte65_perday, vxc1864_perday, vxc0017_perday) %>%
pivot_longer(-c(state, date)) %>%
mutate(age=stringr::str_replace_all(name, "vxc|_perday", "")) %>%
select(state, date, age, perday=value) %>%
left_join(popState, by=c("state", "age")) %>%
mutate(perday_percap=perday/pop) %>%
arrange(state, date, age) %>%
group_by(state, age) %>%
helperRollingAgg("perday_percap", newName="perday_percap7") %>%
ungroup() %>%
filter(!is.na(perday_percap7), state != "DC") %>%
ggplot(aes(x=date, y=1000*perday_percap7)) +
geom_line(aes(group=state, color=state.region[match(state, state.abb)])) +
labs(x=NULL,
y="Completed per thousand (rolling 7-day)",
title="Newly fully vaccinated by day",
subtitle="Persons vaccinated before age-breakouts included NOT counted"
) +
facet_wrap(~age) +
scale_color_discrete("Census\nRegion")
There are discontinuities in the data, particularly for the age group 0-17 bucket. This is likely driven by differences in timing of age breakouts based on authorizations for vaccines by age cohort. Next steps are to investigate and correct for discontinuities, particularly those showing negative completed vaccinations.
The big spike then decline in 0017 is investigated:
deltaData_210717 <- vaxPerCap_210717_func_v3 %>%
select(state, date, vxc, vxcgte65, vxcgte18) %>%
group_by(date) %>%
summarize(across(where(is.numeric), sum)) %>%
mutate(across(where(is.numeric), .fn=function(x) ifelse(lag(x)==0, NA, x - lag(x)), .names="d_{.col}"))
deltaData_210717 %>%
select(date, starts_with("d")) %>%
pivot_longer(-date) %>%
filter(!is.na(value)) %>%
ggplot(aes(x=date, y=value)) +
geom_line(aes(group=name, color=name))
deltaData_210717 %>%
filter(d_vxc < d_vxcgte18)
## # A tibble: 1 x 7
## date vxc vxcgte65 vxcgte18 d_vxc d_vxcgte65 d_vxcgte18
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2021-04-06 62035890 29998523 61906243 612945 318833 2924665
deltaData_210717 %>%
filter(date %in% (as.Date("2021-04-01")+0:10))
## # A tibble: 11 x 7
## date vxc vxcgte65 vxcgte18 d_vxc d_vxcgte65 d_vxcgte18
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2021-04-01 55181012 28135317 55083159 1461437 688366 1454688
## 2 2021-04-02 57047089 28913986 56942235 1866077 778669 1859076
## 3 2021-04-03 58902844 29557456 58696677 1855755 643470 1754442
## 4 2021-04-04 60453008 29624425 58857402 1550164 66969 160725
## 5 2021-04-05 61422945 29679690 58981578 969937 55265 124176
## 6 2021-04-06 62035890 29998523 61906243 612945 318833 2924665
## 7 2021-04-07 63419456 30411378 63284674 1383566 412855 1378431
## 8 2021-04-08 65178735 30843713 65034300 1759279 432335 1749626
## 9 2021-04-09 67158210 31349712 67001971 1979475 505999 1967671
## 10 2021-04-10 69617607 31989344 69447387 2459397 639632 2445416
## 11 2021-04-11 71525283 32417679 71342554 1907676 428335 1895167
There is a situation on April 4-6 where age breakouts for vaccination increases do not make sense. An assumption is made for April 4-6 that d_vxcgte18 will be set to d_vxc for each state:
# Conversions for per-day
perDayFunc <- function(x) ifelse(row_number()==1, x, ifelse(lag(x)==0, 0, x-lag(x)))
perDayList <- list("vxa_perday"=expression(perDayFunc(vxa)),
"vxc_perday"=expression(perDayFunc(vxc)),
"vxcgte65_perday"=expression(perDayFunc(vxcgte65)),
"vxc1864_perday"=expression(perDayFunc(vxc1864)),
"vxc0017_perday"=expression(perDayFunc(vxc0017))
)
vaxPerCap_210717_func_v4 <- vaxPerCap_210717_func_v2 %>%
arrange(state, date) %>%
group_by(state) %>%
mutate(vxcgte18=ifelse(date %in% c(as.Date("2021-04-04")), lag(vxcgte18) + vxc - lag(vxc), vxcgte18)) %>%
mutate(vxcgte18=ifelse(date %in% c(as.Date("2021-04-05")), lag(vxcgte18) + vxc - lag(vxc), vxcgte18)) %>%
mutate(vxcgte18=ifelse(date %in% c(as.Date("2021-04-06")), lag(vxcgte18) + vxc - lag(vxc), vxcgte18)) %>%
ungroup() %>%
colMutater(fullList=subGroupList) %>%
arrange(date, state) %>%
group_by(state) %>%
colMutater(fullList=perDayList) %>%
ungroup()
all.equal(vaxPerCap_210717_func_v4, vaxPerCap_210717_func_v3)
## [1] "Component \"vxcgte18\": Mean relative difference: 0.02034966"
## [2] "Component \"vxc1864\": Mean relative difference: 0.03968432"
## [3] "Component \"vxc0017\": Mean relative difference: 6.03064"
## [4] "Component \"vxc1864_perday\": Mean relative difference: 1.011459"
## [5] "Component \"vxc0017_perday\": Mean relative difference: 40.83178"
all.equal(vaxPerCap_210717_func_v4[!(vaxPerCap_210717_func_v3$date %in% c(as.Date("2021-04-04")+0:3)), ],
vaxPerCap_210717_func_v3[!(vaxPerCap_210717_func_v3$date %in% c(as.Date("2021-04-04")+0:3)), ]
)
## [1] TRUE
deltaData_210717_v4 <- vaxPerCap_210717_func_v4 %>%
select(state, date, vxc, vxcgte65, vxcgte18) %>%
group_by(date) %>%
summarize(across(where(is.numeric), sum)) %>%
mutate(across(where(is.numeric), .fn=function(x) ifelse(lag(x)==0, NA, x - lag(x)), .names="d_{.col}"))
deltaData_210717_v4 %>%
select(date, starts_with("d")) %>%
pivot_longer(-date) %>%
filter(!is.na(value)) %>%
ggplot(aes(x=date, y=value)) +
geom_line(aes(group=name, color=name))
deltaData_210717_v4 %>%
filter(d_vxc < d_vxcgte18)
## # A tibble: 1 x 7
## date vxc vxcgte65 vxcgte18 d_vxc d_vxcgte65 d_vxcgte18
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2021-04-07 63419456 30411378 63284674 1383566 412855 1454951
vaxPerCap_210717_func_v4 %>%
select(state, date, vxcgte65_perday, vxc1864_perday, vxc0017_perday) %>%
pivot_longer(-c(state, date)) %>%
mutate(age=stringr::str_replace_all(name, "vxc|_perday", "")) %>%
select(state, date, age, perday=value) %>%
left_join(popState, by=c("state", "age")) %>%
mutate(perday_percap=perday/pop) %>%
arrange(state, date, age) %>%
group_by(state, age) %>%
helperRollingAgg("perday_percap", newName="perday_percap7") %>%
ungroup() %>%
filter(!is.na(perday_percap7), state != "DC") %>%
ggplot(aes(x=date, y=1000*perday_percap7)) +
geom_line(aes(group=state, color=state.region[match(state, state.abb)])) +
labs(x=NULL,
y="Completed per thousand (rolling 7-day)",
title="Newly fully vaccinated by day",
subtitle="Persons vaccinated before age-breakouts included NOT counted"
) +
facet_wrap(~age) +
scale_color_discrete("Census\nRegion")
The data appear improved, though there are still a handful of states with negative vaccinations completed on given days. This is an area for further investigation.
A function is written to update key variables by day:
updateByDay <- function(df,
dateStart,
dateEnd=NULL,
nDates=NULL,
varGroup=c("state"),
varSort=c("date"),
exprList=list()
) {
# FUNCTION ARGUMENTS
# df: the data frame to be processed
# Convert dateStart to date if not already in that format
if ("character" %in% class(dateStart)) dateStart <- as.Date(dateStart)
if (!("Date") %in% class(dateStart)) stop("\nArgument dateStart must be a Date object or character YYYY-MM-DD\n")
# Create dateEnd from nDates if needed
if (is.null(dateEnd)) {
if (is.null(nDates)) stop("\nMust pass either dateEnd or nDates\n")
dateEnd <- dateStart + nDates - 1
}
if ("character" %in% class(dateEnd)) dateEnd <- as.Date(dateEnd)
if (!("Date") %in% class(dateEnd)) stop("\nArgument dateEnd must be a Date object or character YYYY-MM-DD\n")
# Declare the dates to be investigated
keyDates <- seq.Date(from=dateStart, to=dateEnd, by=1)
cat("\nData will be modified as needed for dates:", keyDates, "\n")
# Arrange and group the data as requested
df <- df %>%
arrange(across(all_of(c(varGroup, varSort)))) %>%
group_by(across(all_of(varGroup)))
# Make updates for each of the keyDates
for (keyDate in keyDates) {
df <- df %>%
mutate(modThis=(date %in% keyDate)) %>%
colMutater(fullList=exprList) %>%
select(-modThis)
}
df
}
modList <- list("vxcgte18"=expression(ifelse(modThis, lag(vxcgte18) + vxc - lag(vxc), vxcgte18)))
vaxPerCap_210717_func_v4 <- updateByDay(vaxPerCap_210717_func_v2,
dateStart="2021-04-03",
dateEnd="2021-04-06",
exprList=modList
) %>%
colMutater(fullList=subGroupList) %>%
colMutater(fullList=perDayList) %>%
ungroup()
##
## Data will be modified as needed for dates: 18720 18721 18722 18723
vaxPerCap_210717_func_v4 %>%
select(state, date, vxcgte65_perday, vxc1864_perday, vxc0017_perday) %>%
pivot_longer(-c(state, date)) %>%
mutate(age=stringr::str_replace_all(name, "vxc|_perday", "")) %>%
select(state, date, age, perday=value) %>%
left_join(popState, by=c("state", "age")) %>%
mutate(perday_percap=perday/pop) %>%
arrange(state, date, age) %>%
group_by(state, age) %>%
helperRollingAgg("perday_percap", newName="perday_percap7") %>%
ungroup() %>%
filter(!is.na(perday_percap7), state != "DC") %>%
ggplot(aes(x=date, y=1000*perday_percap7)) +
geom_line(aes(group=state, color=state.region[match(state, state.abb)])) +
labs(x=NULL,
y="Completed per thousand (rolling 7-day)",
title="Newly fully vaccinated by day",
subtitle="Persons vaccinated before age-breakouts included NOT counted"
) +
facet_wrap(~age) +
scale_color_discrete("Census\nRegion")
vaxPerCap_210717_func_v4 %>%
filter(vxcgte65_perday < 0) %>%
arrange(vxcgte65_perday) %>%
select(state, date, vxcgte65_perday, vxc1864_perday, vxc0017_perday, everything())
## # A tibble: 36 x 21
## state date vxcgte65_perday vxc1864_perday vxc0017_perday vxa vxc
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 TX 2021-04-06 -567992 693861 0 1.33e7 4.80e6
## 2 TX 2021-05-13 -135100 227765 3424 2.07e7 9.06e6
## 3 TX 2021-05-06 -115815 230012 3647 1.97e7 8.37e6
## 4 TX 2021-04-29 -112167 250804 3933 1.86e7 7.65e6
## 5 TX 2021-05-25 -73677 113485 9227 2.24e7 9.97e6
## 6 TX 2021-06-21 -57128 81490 6487 2.50e7 1.15e7
## 7 TX 2021-06-14 -55998 86975 6021 2.43e7 1.10e7
## 8 TX 2021-05-18 -55966 97475 7649 2.14e7 9.51e6
## 9 TX 2021-06-08 -48890 77092 10403 2.36e7 1.07e7
## 10 TX 2021-06-28 -39123 73748 5748 2.56e7 1.19e7
## # ... with 26 more rows, and 14 more variables: vxcgte65 <dbl>,
## # vxcgte65pct <dbl>, vxcgte18 <dbl>, vxcgte18pct <dbl>, Admin_Per_100k <dbl>,
## # vxcpoppct <dbl>, vxapm <dbl>, vxcpm <dbl>, vxapm7 <dbl>, vxcpm7 <dbl>,
## # vxc1864 <dbl>, vxc0017 <dbl>, vxa_perday <dbl>, vxc_perday <dbl>
vaxPerCap_210717_func_v4 %>%
filter(vxc1864_perday < 0) %>%
arrange(vxc1864_perday) %>%
select(state, date, vxcgte65_perday, vxc1864_perday, vxc0017_perday, everything())
## # A tibble: 47 x 21
## state date vxcgte65_perday vxc1864_perday vxc0017_perday vxa vxc
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 NJ 2021-06-25 -26417 -98167 -1631 9.67e6 4.78e6
## 2 TX 2021-04-19 159499 -60513 4017 1.67e7 6.56e6
## 3 FL 2021-04-06 69450 -44626 0 1.06e7 3.88e6
## 4 MI 2021-04-06 39659 -28104 0 4.94e6 1.95e6
## 5 NY 2021-04-06 85828 -25877 0 1.08e7 4.15e6
## 6 IL 2021-04-06 36950 -25200 0 6.63e6 2.36e6
## 7 OH 2021-04-06 40886 -24029 0 5.94e6 2.25e6
## 8 OR 2021-04-06 34523 -23691 0 2.09e6 8.12e5
## 9 MO 2021-04-06 23747 -20457 0 2.86e6 1.08e6
## 10 PA 2021-04-06 44807 -20157 0 6.80e6 2.45e6
## # ... with 37 more rows, and 14 more variables: vxcgte65 <dbl>,
## # vxcgte65pct <dbl>, vxcgte18 <dbl>, vxcgte18pct <dbl>, Admin_Per_100k <dbl>,
## # vxcpoppct <dbl>, vxapm <dbl>, vxcpm <dbl>, vxapm7 <dbl>, vxcpm7 <dbl>,
## # vxc1864 <dbl>, vxc0017 <dbl>, vxa_perday <dbl>, vxc_perday <dbl>
vaxPerCap_210717_func_v4 %>%
filter(vxc0017_perday < 0) %>%
arrange(vxc0017_perday) %>%
select(state, date, vxcgte65_perday, vxc1864_perday, vxc0017_perday, everything())
## # A tibble: 6 x 21
## state date vxcgte65_perday vxc1864_perday vxc0017_perday vxa vxc
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 WV 2021-05-15 1109 4234 -2298 1248326 587339
## 2 NJ 2021-06-25 -26417 -98167 -1631 9667905 4782589
## 3 NH 2021-04-07 4795 5910 -1468 829371 282496
## 4 DC 2021-04-14 501 3176 -68 471911 147799
## 5 AR 2021-06-11 -4 21 -1 2130617 969875
## 6 ND 2021-04-20 18 48 -1 527930 226960
## # ... with 14 more variables: vxcgte65 <dbl>, vxcgte65pct <dbl>,
## # vxcgte18 <dbl>, vxcgte18pct <dbl>, Admin_Per_100k <dbl>, vxcpoppct <dbl>,
## # vxapm <dbl>, vxcpm <dbl>, vxapm7 <dbl>, vxcpm7 <dbl>, vxc1864 <dbl>,
## # vxc0017 <dbl>, vxa_perday <dbl>, vxc_perday <dbl>
The data continue to be in better shape, with rolling-7 smoothing out many of the negative/positive swings. Next steps are to further investigate Texas (65+) and the April 6 data for 65+ and 18-64. It appears that most of the 65+ data is a bolus on April 6, which may need to be smoothed backwards.
Function updateByDay() is updated to allow for subsetting by specific states:
updateByDay <- function(df,
dateStart,
dateEnd=NULL,
nDates=NULL,
allDates=FALSE,
updateStates=NULL,
varGroup=c("state"),
varSort=c("date"),
exprList=list()
) {
# FUNCTION ARGUMENTS
# df: the data frame to be processed
# dateStart: the starting date for the changes
# dateEnd: the ending date for the changes (NULL means infer from dateStart and nDates)
# nDates: the number of days to include (if not NULL, dateEnd is set to dateStart + nDates - 1)
# allDates: boolean, if TRUE run the function for all dates at once rather than date by date
# updateStates: the states to be updated (NULL means all)
# varGroup: data should be grouped by this variable
# exprList: expression list for variable changes, passed as fullList to colMutater()
# Convert dateStart to date if not already in that format
if ("character" %in% class(dateStart)) dateStart <- as.Date(dateStart)
if (!("Date") %in% class(dateStart)) stop("\nArgument dateStart must be a Date object or character YYYY-MM-DD\n")
# Create dateEnd from nDates if needed
if (is.null(dateEnd) & !isTRUE(allDates)) {
if (is.null(nDates)) stop("\nMust pass either dateEnd or nDates or specify allDates=TRUE\n")
dateEnd <- dateStart + nDates - 1
}
if ("character" %in% class(dateEnd)) dateEnd <- as.Date(dateEnd)
if (!("Date") %in% class(dateEnd) & !isTRUE(allDates))
stop("\nArgument allDates must be TRUE or dateEnd must be a Date object or character YYYY-MM-DD\n")
# Declare the dates to be investigated
if (!isTRUE(allDates)) {
keyDates <- seq.Date(from=dateStart, to=dateEnd, by=1)
cat("\nData will be modified as needed for dates:", keyDates, "\n")
}
# Set the key states to be investigated if passed as NULL
if (is.null(updateStates)) updateStates <- df %>% pull(state) %>% unique() %>% sort()
# Arrange and group the data as requested
df <- df %>%
arrange(across(all_of(c(varGroup, varSort)))) %>%
group_by(across(all_of(varGroup)))
# Make updates for each of the keyDates
if (!isTRUE(allDates)) {
for (keyDate in keyDates) {
df <- df %>%
mutate(modThis=(date %in% keyDate) & (state %in% updateStates)) %>%
colMutater(fullList=exprList) %>%
select(-modThis)
}
} else {
df <- df %>%
mutate(modThis=state %in% updateStates) %>%
colMutater(fullList=exprList) %>%
select(-modThis)
}
df
}
fullModList <- list("vxcgte18"=expression(ifelse(modThis, lag(vxcgte18) + vxc - lag(vxc), vxcgte18)))
txModList <- list("vxc"=expression(ifelse(modThis, zoo::rollmean(vxc, k=7, fill=NA), vxc)),
"vxcgte18"=expression(ifelse(modThis, zoo::rollmean(vxcgte18, k=7, fill=NA), vxcgte18)),
"vxcgte65"=expression(ifelse(modThis, zoo::rollmean(vxcgte65, k=7, fill=NA), vxcgte65))
)
vaxPerCap_210717_func_v4_new <- updateByDay(vaxPerCap_210717_func_v2,
dateStart="2021-04-03",
dateEnd="2021-04-06",
exprList=fullModList
) %>%
updateByDay(dateStart="2020-01-01", allDates=TRUE, updateStates=c("TX"), exprList=txModList) %>%
colMutater(fullList=subGroupList) %>%
colMutater(fullList=perDayList) %>%
ungroup()
##
## Data will be modified as needed for dates: 18720 18721 18722 18723
vaxPerCap_210717_func_v4_new %>%
select(state, date, vxcgte65_perday, vxc1864_perday, vxc0017_perday) %>%
pivot_longer(-c(state, date)) %>%
mutate(age=stringr::str_replace_all(name, "vxc|_perday", "")) %>%
select(state, date, age, perday=value) %>%
left_join(popState, by=c("state", "age")) %>%
mutate(perday_percap=perday/pop) %>%
arrange(state, date, age) %>%
group_by(state, age) %>%
helperRollingAgg("perday_percap", newName="perday_percap7") %>%
ungroup() %>%
filter(!is.na(perday_percap7), state != "DC") %>%
ggplot(aes(x=date, y=1000*perday_percap7)) +
geom_line(aes(group=state, color=state.region[match(state, state.abb)])) +
labs(x=NULL,
y="Completed per thousand (rolling 7-day)",
title="Newly fully vaccinated by day",
subtitle="Persons vaccinated before age-breakouts included NOT counted"
) +
facet_wrap(~age) +
scale_color_discrete("Census\nRegion")
vaxPerCap_210717_func_v4_new %>%
filter(vxcgte65_perday < 0) %>%
arrange(vxcgte65_perday) %>%
select(state, date, vxcgte65_perday, vxc1864_perday, vxc0017_perday, everything())
## # A tibble: 32 x 21
## state date vxcgte65_perday vxc1864_perday vxc0017_perday vxa vxc
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 TX 2021-04-06 -46105. 176079. 2119. 1.33e7 4.82e6
## 2 TX 2021-04-09 -45525. 197402. 3544. 1.42e7 5.28e6
## 3 TX 2021-04-08 -43464. 195529. 3102 1.38e7 5.12e6
## 4 TX 2021-04-07 -42975. 189957. 2673. 1.35e7 4.97e6
## 5 TX 2021-04-05 -42769. 171650. 1931. 1.31e7 4.69e6
## 6 NJ 2021-06-25 -26417 -98167 -1631 9.67e6 4.78e6
## 7 TX 2021-04-04 -23817. 156215. 1826. 1.28e7 4.56e6
## 8 TX 2021-04-03 -22253. 154919. 686. 1.26e7 4.42e6
## 9 PA 2021-03-12 -14990 21557 1007 3.76e6 1.21e6
## 10 TX 2021-05-16 -8737. 88786. 8991. 2.12e7 9.34e6
## # ... with 22 more rows, and 14 more variables: vxcgte65 <dbl>,
## # vxcgte65pct <dbl>, vxcgte18 <dbl>, vxcgte18pct <dbl>, Admin_Per_100k <dbl>,
## # vxcpoppct <dbl>, vxapm <dbl>, vxcpm <dbl>, vxapm7 <dbl>, vxcpm7 <dbl>,
## # vxc1864 <dbl>, vxc0017 <dbl>, vxa_perday <dbl>, vxc_perday <dbl>
vaxPerCap_210717_func_v4_new %>%
filter(vxc1864_perday < 0) %>%
arrange(vxc1864_perday) %>%
select(state, date, vxcgte65_perday, vxc1864_perday, vxc0017_perday, everything())
## # A tibble: 45 x 21
## state date vxcgte65_perday vxc1864_perday vxc0017_perday vxa vxc
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 NJ 2021-06-25 -26417 -98167 -1631 9.67e6 4.78e6
## 2 FL 2021-04-06 69450 -44626 0 1.06e7 3.88e6
## 3 MI 2021-04-06 39659 -28104 0 4.94e6 1.95e6
## 4 NY 2021-04-06 85828 -25877 0 1.08e7 4.15e6
## 5 IL 2021-04-06 36950 -25200 0 6.63e6 2.36e6
## 6 OH 2021-04-06 40886 -24029 0 5.94e6 2.25e6
## 7 OR 2021-04-06 34523 -23691 0 2.09e6 8.12e5
## 8 MO 2021-04-06 23747 -20457 0 2.86e6 1.08e6
## 9 PA 2021-04-06 44807 -20157 0 6.80e6 2.45e6
## 10 WA 2021-04-06 30652 -19542 0 4.04e6 1.59e6
## # ... with 35 more rows, and 14 more variables: vxcgte65 <dbl>,
## # vxcgte65pct <dbl>, vxcgte18 <dbl>, vxcgte18pct <dbl>, Admin_Per_100k <dbl>,
## # vxcpoppct <dbl>, vxapm <dbl>, vxcpm <dbl>, vxapm7 <dbl>, vxcpm7 <dbl>,
## # vxc1864 <dbl>, vxc0017 <dbl>, vxa_perday <dbl>, vxc_perday <dbl>
vaxPerCap_210717_func_v4_new %>%
filter(vxc0017_perday < 0) %>%
arrange(vxc0017_perday) %>%
select(state, date, vxcgte65_perday, vxc1864_perday, vxc0017_perday, everything())
## # A tibble: 6 x 21
## state date vxcgte65_perday vxc1864_perday vxc0017_perday vxa vxc
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 WV 2021-05-15 1109 4234 -2298 1248326 587339
## 2 NJ 2021-06-25 -26417 -98167 -1631 9667905 4782589
## 3 NH 2021-04-07 4795 5910 -1468 829371 282496
## 4 DC 2021-04-14 501 3176 -68 471911 147799
## 5 AR 2021-06-11 -4 21 -1 2130617 969875
## 6 ND 2021-04-20 18 48 -1 527930 226960
## # ... with 14 more variables: vxcgte65 <dbl>, vxcgte65pct <dbl>,
## # vxcgte18 <dbl>, vxcgte18pct <dbl>, Admin_Per_100k <dbl>, vxcpoppct <dbl>,
## # vxapm <dbl>, vxcpm <dbl>, vxapm7 <dbl>, vxcpm7 <dbl>, vxc1864 <dbl>,
## # vxc0017 <dbl>, vxa_perday <dbl>, vxc_perday <dbl>
vaxPerCap_210717_func_v4_new %>%
filter(state=="TX") %>%
select(date, vxc, vxcgte65, vxcgte18) %>%
mutate(across(where(is.numeric), ~.-lag(.), .names="d_{.col}")) %>%
pivot_longer(-date) %>%
group_by(name) %>%
filter(!is.na(value), lag(value) != 0) %>%
ggplot(aes(x=date, y=value)) +
geom_line(aes(color=name, group=name)) +
facet_wrap(~name, scales="free_y")
vaxPerCap_210717_func_v4_new %>%
filter(state=="TX") %>%
select(date, vxc, vxcgte65, vxcgte18) %>%
mutate(across(where(is.numeric), ~.-lag(.), .names="d_{.col}")) %>%
select(date, starts_with("d_")) %>%
pivot_longer(-date) %>%
group_by(name) %>%
filter(!is.na(value), lag(value) != 0) %>%
ggplot(aes(x=date, y=value)) +
geom_line(aes(color=name, group=name))
vaxPerCap_210717_func_v4_new %>%
filter(state=="FL") %>%
select(date, vxc, vxcgte65, vxcgte18) %>%
mutate(across(where(is.numeric), ~.-lag(.), .names="d_{.col}")) %>%
pivot_longer(-date) %>%
group_by(name) %>%
filter(!is.na(value), lag(value) != 0) %>%
ggplot(aes(x=date, y=value)) +
geom_line(aes(color=name, group=name)) +
facet_wrap(~name, scales="free_y")
vaxPerCap_210717_func_v4_new %>%
filter(state=="FL") %>%
select(date, vxc, vxcgte65, vxcgte18) %>%
mutate(across(where(is.numeric), ~.-lag(.), .names="d_{.col}")) %>%
select(date, starts_with("d_")) %>%
pivot_longer(-date) %>%
group_by(name) %>%
filter(!is.na(value), lag(value) != 0) %>%
ggplot(aes(x=date, y=value)) +
geom_line(aes(color=name, group=name))
There is still work to do with the timing by age cohort, particularly around April 6. The data may be sufficiently robust “as is” for including with other metrics. The function readRunCDCDaily() is updated, along with the appropriate mapping files:
# Function to download/load, process, segment, and analyze data for CDC daily
readRunCDCDaily <- function(thruLabel,
downloadTo=list("cdcDaily"=NA, "cdcHosp"=NA, "vax"=NA),
readFrom=downloadTo,
compareFile=list("cdcDaily"=NA, "cdcHosp"=NA, "vax"=NA),
writeLog=NULL,
ovrwriteLog=TRUE,
dfPerCapita=NULL,
useClusters=NULL,
hierarchical=TRUE,
returnList=!isTRUE(hierarchical),
kCut=6,
reAssignState=vector("list", 0),
skipAssessmentPlots=FALSE,
brewPalette=NA,
...
) {
# FUNCTION ARGUMENTS:
# thruLabel: the label for when the data are through (e.g., "Aug 30, 2020")
# donwloadTo: named list for locations to download data (cdcDaily and cdcHosp)
# NA means do not download data for that particular element
# readFrom: named list for locations to read data from (defaults to donwloadTo)
# compareFile: named list for the reference file to be used for cdcDaily and cdcHosp
# NA means do not use a reference file for that element
# dateChangePlot: boolean, should changes in dates be captured as a plot rather than as a list?
# dateMetricPrint: boolean, should the changes by date and metric be printed to the main log?
# writeLog: name of a separate log file for capturing detailed data on changes between files
# NULL means no detailed data captured
# ovrwriteLog: boolean, should the log file be overwritten and started again from scratch?
# dfPerCapita: file can be passed directly, which bypasses the loading and processing steps
# default NULL means create dfPerCapita using steps 2-4
# useClusters: file containing clusters by state (NULL means make the clusters from the data)
# hierarchical: boolean, should hierarchical clusters be produced (if FALSE, will be k-means)?
# returnList: boolean, should a list be returned or just the cluster object?
# refers to what is returned by clusterStates(); the main function always returns a list
# kCut: number of segments when cutting the hierarchical tree
# reAssignState: mapping file for assigning a state to another state's cluster
# format list("stateToChange"="stateClusterToAssign")
# skipAssessmentPlots: boolean to skip the plots for assessClusters()
# especially useful if just exploring dendrograms or silhouette widths
# brewPalette: create plots using this color scheme (needs to be valid in ggplot2::scale_*_brewer())
# NA means use R default color schemes
# ...: arguments to be passed to clusterStates(), will be used only if useClusters is NULL
# STEP 0: Function to create the return list
createFinalList <- function(plots=TRUE) {
list(stateData=stateData,
dfRaw=dfRawList,
dfProcess=dfProcessList,
dfPerCapita=dfPerCapita,
useClusters=useClusters,
plotDataList=if(plots) plotDataList else NULL
)
}
# STEP 1: Get state data
stateData <- getStateData()
# If a log file is requested, create the log file (allows for append=TRUE for all downstream functions)
if (!is.null(writeLog)) genNewLog(writeLog=writeLog, ovrwriteLog=ovrwriteLog)
# Get the data types to be used (will be the elements of readFrom) and create a file storage list
elemUsed <- names(readFrom)
dfRawList <- vector("list", length=length(elemUsed)) %>% purrr::set_names(elemUsed)
dfProcessList <- vector("list", length=length(elemUsed)) %>% purrr::set_names(elemUsed)
# Steps 2-4 are run only is dfPerCapita has not been passed
if (is.null(dfPerCapita)) {
# Step 2: Download and QC all of the requested data
for (elem in elemUsed) {
dfRawList[[elem]] <- readQCRawCDCDaily(fileName=readFrom[[elem]],
writeLog=writeLog,
ovrwriteLog=FALSE,
urlType=elem,
getData=if(is.na(downloadTo[[elem]])) FALSE else TRUE,
dfRef=compareFile[[elem]]
)
glimpseLog(dfRawList[[elem]], txt=paste0("\nRaw file for ", elem, ":\n"), logFile=writeLog)
}
# Step 3: Process all of the requested data
for (elem in elemUsed) {
dfProcessList[[elem]] <- processRawFile(dfRawList[[elem]],
vecRename=c(), # already handled in readQCRawCDCDaily()
vecSelect=vecSelectMapper[[elem]],
lstCombo=lstComboMapper[[elem]],
lstFilter=lstFilterMapper[[elem]]
)
glimpseLog(dfProcessList[[elem]], txt=paste0("\nProcessed for ", elem, ":\n"), logFile=writeLog)
}
# Step 4: Integrate in to a dfPerCapita file and glimpse (to specified log file)
dfPerCapita <- createPerCapita(dfProcessList,
uqBy=c("state", "date"),
popData=stateData,
mapper=perCapMapper,
asIsVars=if(isTRUE(exists("asIsMapper"))) asIsMapper[[elem]] else c()
)
glimpseLog(dfPerCapita, txt="\nIntegrated per capita data file:\n", logFile=writeLog)
} else {
dfRawList <- NULL
dfProcessList <- NULL
}
# STEP 5: Create the clusters (if they have not been passed)
if (is.null(useClusters)) {
clData <- clusterStates(df=dfPerCapita, hierarchical=hierarchical, returnList=returnList, ...)
useClusters <- getClusters(clData, hier=hierarchical, kCut=kCut, reAssign=reAssignState)
}
# STEP 5a: Stop the process and return what is available if skipAssessmentPlots is TRUE
if (skipAssessmentPlots)
return(createFinalList(plots=FALSE))
# STEP 6: Create the cluster assessments
lstFuns <- list("stateData"=function(x) colSelector(x, vecSelect=c("state", "pop")),
"dfPerCapita"=NULL
)
plotDataList <- diagnoseClusters(lst=list("stateData"=stateData,
"dfPerCapita"=dfPerCapita,
"useClusters"=useClusters
),
lstExtract=lstFuns,
brewPalette=brewPalette
)
# STEP 7: Return a list of the key data
return(createFinalList(plots=TRUE))
}
# Mapping list for combining data elements from raw files
# Formatted as one list per urlType, with that list having one list for each combination of data
lstComboMapper <- list("cdcDaily"=list("nyc"=list("comboVar"="state",
"uqVars"="date",
"vecCombo"=c("NY"="NY", "NYC"="NY"),
"fn"=specNA(sum)
)
),
"cdcHosp"=list(),
"vax"=list()
)
# Mapping for urlType to url
urlMapper <- c("cdcDaily"="https://data.cdc.gov/api/views/9mfq-cb36/rows.csv?accessType=DOWNLOAD",
"cdcHosp"="https://beta.healthdata.gov/api/views/g62h-syeh/rows.csv?accessType=DOWNLOAD",
"vax"="https://data.cdc.gov/api/views/unsk-b7fc/rows.csv?accessType=DOWNLOAD"
)
# Mapping for urlType to colRenamer(vecRename=...)
renMapper <- list("cdcDaily"=c('submission_date'='date', 'new_case'='new_cases',
'tot_death'='tot_deaths', 'new_death'='new_deaths'
),
"cdcHosp"=c("inpatient_beds_used_covid"="inp",
"total_adult_patients_hospitalized_confirmed_and_suspected_covid"="hosp_adult",
"total_pediatric_patients_hospitalized_confirmed_and_suspected_covid"="hosp_ped"
),
"vax"=c("Location"="state",
"Date"="date",
"Series_Complete_Yes"="vxc",
"Administered"="vxa",
"Admin_Per_100K"="Admin_Per_100k",
"Series_Complete_Pop_Pct"="vxcpoppct",
"Series_Complete_65Plus"="vxcgte65",
"Series_Complete_65PlusPop_Pct"="vxcgte65pct",
"Series_Complete_18Plus"="vxcgte18",
"Series_Complete_18PlusPop_Pct"="vxcgte18pct"
),
"default"=c()
)
# Mapping for urlType to colMutater(selfList=...)
selfListMapper <- list("cdcDaily"=list('date'=lubridate::mdy),
"cdcHosp"=list(),
"vax"=list("date"=lubridate::mdy),
"default"=list()
)
# Mapping for urlType to colMutater(fullList=...)
fullListMapper <- list("cdcDaily"=list(),
"cdcHosp"=list(),
"vax"=list(),
"default"=list()
)
# Mapping for urlType to checkUniqueRows(uniqueBy=...)
uqMapper <- list("cdcDaily"=c("state", "date"),
"cdcHosp"=c("state", "date"),
"vax"=c("state", "date")
)
# Mapping list for rows to be filtered (typically, states to be kept)
# Formatted as named list per urlType, with name being the field and element being the allowed values
lstFilterMapper <- list("cdcDaily"=list("state"=c(state.abb, "DC")),
"cdcHosp"=list("state"=c(state.abb, "DC")),
"vax"=list("state"=c(state.abb, "DC"))
)
# Mapping list for vector selection in processed data
# Formatted as a named list where the names are urlType and the values are fields to be kept
vecSelectMapper <- list("cdcDaily"=c("date", "state", "tot_cases", "tot_deaths", "new_cases", "new_deaths"),
"cdcHosp"=c("date", "state", "inp", "hosp_adult", "hosp_ped"),
"vax"=c("date", "state", "vxa",
"vxc", "vxcpoppct",
"vxcgte65", "vxcgte65pct",
"vxcgte18", "vxcgte18pct"
)
)
# Mapping file for group_by variable per urlType
checkControlGroupMapper <- list("cdcDaily"="date",
"cdcHosp"="date",
"vax"="date",
"default"=c()
)
# Mapping file for numerics to summarize by group_by variable per urlType
checkControlVarsMapper <- list("cdcDaily"=c("new_cases", "new_deaths"),
"cdcHosp"=c("inp", "hosp_adult", "hosp_ped"),
"vax"=c("vxa", "vxc", "vxcgte65", "vxcgte18")
)
# Mapping for urlType to checkSimilarity(..., keyVars=); universe similarity checks to perform and report
checkSimilarityMapper <- list("cdcDaily"=list(date=list(label='date', countOnly=TRUE, convChar=TRUE),
state=list(label='state', countOnly=FALSE)
),
"cdcHosp"=list(date=list(label='date', countOnly=TRUE, convChar=TRUE),
state=list(label='state', countOnly=FALSE)
),
"vax"=list(date=list(label='date', countOnly=TRUE, convChar=TRUE),
state=list(label='state', countOnly=FALSE)
),
"default"=list()
)
# Mapping for urlType to plotSimilarity(..., ); fields where change in universe should be reported
plotSimilarityMapper <- list("cdcDaily"=c("date"),
"cdcHosp"=c("date"),
"vax"=c("date"),
"default"=c()
)
# Mapping file for aggregated control total checks to perform
# Formatted as one list per urlType
# Within each urlType list, sublists drive the grouping variable, numerical aggregates, and reporting
keyAggMapper <- list("cdcDaily"=list("l1"=list("grpVar"="date",
"numVars"=c("new_cases", "new_deaths",
"tot_cases", "tot_deaths"
),
"sameUniverse"=NA,
"plotData"=TRUE,
"isLine"=TRUE,
"returnDelta"=TRUE,
"flagLargeDelta"=TRUE,
"pctTol"=0.05,
"absTol"=5,
"sortBy"=c("name", "pctDelta", "absDelta"),
"dropNA"=TRUE,
"printAll"=TRUE
),
"l2"=list("grpVar"="state",
"numVars"=c("new_cases", "new_deaths"),
"sameUniverse"=NA,
"plotData"=TRUE,
"isLine"=FALSE,
"returnDelta"=FALSE,
"flagLargeDelta"=FALSE
),
"l3"=list("grpVar"="state",
"numVars"=c("new_cases", "new_deaths",
"tot_cases", "tot_deaths"
),
"sameUniverse"="date",
"plotData"=TRUE,
"isLine"=FALSE,
"returnDelta"=TRUE,
"flagLargeDelta"=TRUE,
"pctTol"=0.001,
"absTol"=0,
"sortBy"=c("name", "pctDelta", "absDelta"),
"dropNA"=TRUE,
"printAll"=TRUE
)
),
"cdcHosp"=list("l1"=list("grpVar"="date",
"numVars"=c("inp", "hosp_adult", "hosp_ped"),
"sameUniverse"=NA,
"plotData"=TRUE,
"isLine"=TRUE,
"returnDelta"=TRUE,
"flagLargeDelta"=TRUE,
"pctTol"=0.05,
"absTol"=5,
"sortBy"=c("name", "pctDelta", "absDelta"),
"dropNA"=TRUE,
"printAll"=TRUE
),
"l2"=list("grpVar"="state",
"numVars"=c("inp", "hosp_adult", "hosp_ped"),
"sameUniverse"=NA,
"plotData"=TRUE,
"isLine"=FALSE,
"returnDelta"=FALSE,
"flagLargeDelta"=FALSE
),
"l3"=list("grpVar"="state",
"numVars"=c("inp", "hosp_adult", "hosp_ped"),
"sameUniverse"="date",
"plotData"=TRUE,
"isLine"=FALSE,
"returnDelta"=TRUE,
"flagLargeDelta"=TRUE,
"pctTol"=0.001,
"absTol"=0,
"sortBy"=c("name", "pctDelta", "absDelta"),
"dropNA"=TRUE,
"printAll"=TRUE
)
),
"vax"=list("l1"=list("grpVar"="date",
"numVars"=c("vxa", "vxc", "vxcgte18", "vxcgte65"),
"sameUniverse"=NA,
"plotData"=TRUE,
"isLine"=TRUE,
"returnDelta"=TRUE,
"flagLargeDelta"=TRUE,
"pctTol"=0.01,
"absTol"=1,
"sortBy"=c("name", "pctDelta", "absDelta"),
"dropNA"=TRUE,
"printAll"=TRUE
),
"l3"=list("grpVar"="state",
"numVars"=c("vxa", "vxc", "vxcgte18", "vxcgte65"),
"sameUniverse"="date",
"plotData"=TRUE,
"isLine"=FALSE,
"returnDelta"=TRUE,
"flagLargeDelta"=TRUE,
"pctTol"=0.001,
"absTol"=0,
"sortBy"=c("name", "pctDelta", "absDelta"),
"dropNA"=TRUE,
"printAll"=TRUE
)
)
)
# Mapping file for variables to be kept "as is" (not converted to rolling-7 per million)
asIsMapper <- list("cdcDaily"=c(),
"cdcHosp"=c(),
"vax"=c("vxcpoppct", "vxcgte65", "vxcgte65pct", "vxcgte18", "vxcgte18pct")
)
# Mapping file for creating per-capita metrics
# Formatted as c('raw variable name'='associated per capita variable name')
perCapMapper <- c("tot_cases"="tcpm",
"tot_deaths"="tdpm",
"new_cases"="cpm",
"new_deaths"="dpm",
"inp"="hpm",
"hosp_adult"="ahpm",
"hosp_ped"="phpm",
"vxa"="vxapm",
"vxc"="vxcpm"
)
The function is run solely for the vaccines data:
readList <- list("vax"="./RInputFiles/Coronavirus/vaxData_downloaded_210728.csv")
compareList <- list("vax"=vaxRaw_210717_func %>%
colRenamer(vecRename=c("Administered"="vxa",
"Series_Complete_Yes"="vxc",
"Series_Complete_18Plus"="vxcgte18",
"Series_Complete_65Plus"="vxcgte65"
)
)
)
cdc_daily_210728_vaxonly <- readRunCDCDaily(thruLabel="Jul 28, 2021",
downloadTo=lapply(readList, FUN=function(x) if(file.exists(x)) NA else x),
readFrom=readList,
compareFile=compareList,
writeLog=NULL,
useClusters=readFromRDS("cdc_daily_210528")$useClusters,
skipAssessmentPlots=TRUE
)
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/vaxData_downloaded_210728.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## Date = col_character(),
## Location = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: state date
##
##
## Checking for similarity of: column names
## In reference but not in current: Series_Complete_Pop_Pct Series_Complete_18PlusPop_Pct Series_Complete_65PlusPop_Pct
## In current but not in reference: vxcpoppct vxcgte18pct vxcgte65pct
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 11
##
## Checking for similarity of: state
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 1 and at least 1%
##
## [1] date name newValue refValue absDelta pctDelta
## <0 rows> (or 0-length row.names)
##
##
## ***Differences of at least 0 and at least 0.1%
##
## [1] state name newValue refValue absDelta pctDelta
## <0 rows> (or 0-length row.names)
##
##
##
## Raw file for vax:
## Rows: 14,658
## Columns: 69
## $ date <date> 2021-07-27, 2021-07-27, 2021-0~
## $ MMWR_week <dbl> 30, 30, 30, 30, 30, 30, 30, 30,~
## $ state <chr> "PA", "WV", "VA", "MP", "AL", "~
## $ Distributed <dbl> 16160625, 1951775, 10745415, 76~
## $ Distributed_Janssen <dbl> 1060700, 110200, 545400, 2600, ~
## $ Distributed_Moderna <dbl> 6922840, 892060, 4198860, 20200~
## $ Distributed_Pfizer <dbl> 8177085, 949515, 6001155, 54030~
## $ Distributed_Unk_Manuf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ Dist_Per_100K <dbl> 126235, 108907, 125891, 135069,~
## $ Distributed_Per_100k_12Plus <dbl> 145834, 125287, 147057, 158165,~
## $ Distributed_Per_100k_18Plus <dbl> 158946, 136242, 160988, 173415,~
## $ Distributed_Per_100k_65Plus <dbl> 675227, 531803, 790740, 797157,~
## $ vxa <dbl> 14448982, 1484938, 9615043, 605~
## $ Administered_12Plus <dbl> 14444707, 1484027, 9584540, 605~
## $ Administered_18Plus <dbl> 13737829, 1419165, 8974403, 547~
## $ Administered_65Plus <dbl> 4417620, 535420, 2296788, 5449,~
## $ Administered_Janssen <dbl> 565091, 37957, 348579, 526, 118~
## $ Administered_Moderna <dbl> 5944305, 680353, 3631930, 8207,~
## $ Administered_Pfizer <dbl> 7939050, 766278, 5631815, 51865~
## $ Administered_Unk_Manuf <dbl> 536, 350, 2719, 0, 4, 34, 0, 0,~
## $ Administered_Fed_LTC <dbl> 433655, 25905, 215562, 1, 90359~
## $ Administered_Fed_LTC_Residents <dbl> 220783, 4, 109605, 0, 48007, 76~
## $ Administered_Fed_LTC_Staff <dbl> 193617, 2, 80422, 0, 32668, 210~
## $ Administered_Fed_LTC_Unk <dbl> 19255, 25899, 25535, 1, 9684, 2~
## $ Administered_Fed_LTC_Dose1 <dbl> 240534, 13452, 120044, 1, 55026~
## $ Administered_Fed_LTC_Dose1_Residents <dbl> 120011, 4, 58837, 0, 28449, 458~
## $ Administered_Fed_LTC_Dose1_Staff <dbl> 106985, 1, 44545, 0, 20493, 139~
## $ Administered_Fed_LTC_Dose1_Unk <dbl> 13538, 13447, 16662, 1, 6084, 1~
## $ Admin_Per_100k <dbl> 112865, 82858, 112647, 106533, ~
## $ Admin_Per_100k_12Plus <dbl> 130349, 95262, 131170, 124749, ~
## $ Admin_Per_100k_18Plus <dbl> 135117, 99064, 134455, 123614, ~
## $ Admin_Per_100k_65Plus <dbl> 184578, 145887, 169017, 56537, ~
## $ Recip_Administered <dbl> 14478510, 1484445, 9631592, 606~
## $ Administered_Dose1_Recip <dbl> 8324369, 822974, 5227876, 32039~
## $ Administered_Dose1_Pop_Pct <dbl> 65.0, 45.9, 61.2, 56.3, 42.3, 5~
## $ Administered_Dose1_Recip_12Plus <dbl> 8321311, 822365, 5209842, 32039~
## $ Administered_Dose1_Recip_12PlusPop_Pct <dbl> 75.1, 52.8, 71.3, 66.0, 49.5, 6~
## $ Administered_Dose1_Recip_18Plus <dbl> 7907600, 784742, 4880306, 28690~
## $ Administered_Dose1_Recip_18PlusPop_Pct <dbl> 77.8, 54.8, 73.1, 64.8, 52.6, 6~
## $ Administered_Dose1_Recip_65Plus <dbl> 2465111, 285837, 1223319, 2809,~
## $ Administered_Dose1_Recip_65PlusPop_Pct <dbl> 99.9, 77.9, 90.0, 29.1, 80.4, 9~
## $ vxc <dbl> 6645348, 698205, 4626792, 29118~
## $ vxcpoppct <dbl> 51.9, 39.0, 54.2, 51.2, 34.1, 4~
## $ Series_Complete_12Plus <dbl> 6644090, 697917, 4615059, 29118~
## $ Series_Complete_12PlusPop_Pct <dbl> 60.0, 44.8, 63.2, 59.9, 39.9, 5~
## $ vxcgte18 <dbl> 6355406, 671008, 4340052, 26632~
## $ vxcgte18pct <dbl> 62.5, 46.8, 65.0, 60.1, 42.8, 5~
## $ vxcgte65 <dbl> 1997265, 257577, 1115550, 2716,~
## $ vxcgte65pct <dbl> 83.5, 70.2, 82.1, 28.2, 69.3, 8~
## $ Series_Complete_Janssen <dbl> 567898, 39058, 342545, 532, 121~
## $ Series_Complete_Moderna <dbl> 2601426, 309693, 1690439, 3945,~
## $ Series_Complete_Pfizer <dbl> 3475610, 349224, 2592941, 24641~
## $ Series_Complete_Unk_Manuf <dbl> 414, 230, 867, 0, 179, 18, 0, 3~
## $ Series_Complete_Janssen_12Plus <dbl> 567854, 39044, 342458, 532, 121~
## $ Series_Complete_Moderna_12Plus <dbl> 2601297, 309608, 1690272, 3945,~
## $ Series_Complete_Pfizer_12Plus <dbl> 3474525, 349035, 2581462, 24641~
## $ Series_Complete_Unk_Manuf_12Plus <dbl> 414, 230, 867, 0, 179, 18, 0, 3~
## $ Series_Complete_Janssen_18Plus <dbl> 567567, 39003, 341071, 532, 121~
## $ Series_Complete_Moderna_18Plus <dbl> 2599676, 309403, 1685252, 3944,~
## $ Series_Complete_Pfizer_18Plus <dbl> 3187774, 322390, 2312883, 22156~
## $ Series_Complete_Unk_Manuf_18Plus <dbl> 389, 212, 846, 0, 179, 18, 0, 3~
## $ Series_Complete_Janssen_65Plus <dbl> 78150, 6283, 63400, 73, 31781, ~
## $ Series_Complete_Moderna_65Plus <dbl> 945835, 131721, 531139, 438, 31~
## $ Series_Complete_Pfizer_65Plus <dbl> 973096, 119473, 520519, 2205, 2~
## $ Series_Complete_Unk_Manuf_65Plus <dbl> 184, 100, 492, 0, 113, 11, 0, 3~
## $ Series_Complete_FedLTC <dbl> 192384, 12528, 88488, 0, 35859,~
## $ Series_Complete_FedLTC_Residents <dbl> 99960, 4, 45714, 0, 19862, 3115~
## $ Series_Complete_FedLTC_Staff <dbl> 85972, 1, 32869, 0, 12295, 736,~
## $ Series_Complete_FedLTC_Unknown <dbl> 6452, 12523, 9905, 0, 3702, 142~
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 9
## isType vxa vxc vxcpoppct vxcgte65 vxcgte65pct vxcgte18 vxcgte18pct
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 before 7.90e+10 3.24e+10 290630. 1.07e+10 543665. 3.16e+10 359306.
## 2 after 3.76e+10 1.57e+10 245996. 5.17e+ 9 495708. 1.53e+10 308154.
## 3 pctchg 5.24e- 1 5.16e- 1 0.154 5.16e- 1 0.0882 5.16e- 1 0.142
## # ... with 1 more variable: n <dbl>
##
##
## Processed for vax:
## Rows: 11,526
## Columns: 9
## $ date <date> 2021-07-27, 2021-07-27, 2021-07-27, 2021-07-27, 2021-07-2~
## $ state <chr> "PA", "WV", "VA", "AL", "SD", "OK", "ND", "IN", "MD", "HI"~
## $ vxa <dbl> 14448982, 1484938, 9615043, 3534356, 845474, 3422699, 6587~
## $ vxc <dbl> 6645348, 698205, 4626792, 1672423, 413425, 1582479, 304231~
## $ vxcpoppct <dbl> 51.9, 39.0, 54.2, 34.1, 46.7, 40.0, 39.9, 44.0, 58.5, 53.3~
## $ vxcgte65 <dbl> 1997265, 257577, 1115550, 588862, 131193, 481244, 89586, 8~
## $ vxcgte65pct <dbl> 83.5, 70.2, 82.1, 69.3, 86.4, 75.8, 74.8, 81.5, 87.8, 84.5~
## $ vxcgte18 <dbl> 6355406, 671008, 4340052, 1634263, 397774, 1524717, 294024~
## $ vxcgte18pct <dbl> 62.5, 46.8, 65.0, 42.8, 59.6, 50.7, 50.5, 54.9, 70.5, 64.4~
##
## Integrated per capita data file:
## Rows: 11,526
## Columns: 13
## $ date <date> 2020-12-14, 2020-12-14, 2020-12-14, 2020-12-14, 2020-12-1~
## $ state <chr> "AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "FL"~
## $ vxa <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ vxc <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ vxcpoppct <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ vxcgte65 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ vxcgte65pct <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ vxcgte18 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ vxcgte18pct <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ vxapm <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ vxcpm <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ vxapm7 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ vxcpm7 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
cdc_daily_210728_vaxonly$dfPerCapita %>%
select(date, state, vxapm, vxcpm, vxcgte65pct, vxcgte18pct) %>%
pivot_longer(-c(state, date)) %>%
mutate(value=ifelse(name %in% c("vxapm", "vxcpm"), value/1000000, value/100)) %>%
ggplot(aes(x=date, y=value)) +
geom_line(aes(group=state), color="gray", size=0.5) +
geom_line(data=~summarize(group_by(., date, name), value=median(value, na.rm=TRUE), .groups="drop"),
aes(color=name),
size=1.5
) +
facet_wrap(~c("vxapm"="1. Doses administered per capita",
"vxcpm"="2. Fully vaccinated per capita",
"vxcgte18pct"="4. Fully vaccinated per capita (18+)",
"vxcgte65pct"="3. Fully vaccinated per capita (65+)"
)[name]) +
labs(x=NULL,
y=NULL,
title="Vaccinations by state (%)",
subtitle="Colored line is median of 50 states/DC"
) +
geom_hline(yintercept=c(0.5, 1), lty=2, size=0.25) +
theme(legend.position="none")
The process appears to read and process vaccines data as intended. Next steps are to update the plotting routines for segments to include vaccines data.
A plot is created for the per-capita hospitalizations by state, with vaccines data overlaid:
newHosp_210727 <- readFromRDS("cdc_daily_210727")[["dfRaw"]][["cdcHosp"]] %>%
select(state, date, contains("_covid_confirmed"), -contains("coverage")) %>%
group_by(date, grp=state) %>%
summarize(across(where(is.numeric), sum, na.rm=TRUE), .groups="drop") %>%
pivot_longer(-c(date, grp)) %>%
filter(value >= 0, grepl(pattern="_[2-8]", name), date >= "2020-08-01") %>%
mutate(name=stringr::str_replace(name, pattern="previous_day_admission_", replacement="")) %>%
group_by(name, grp) %>%
helperRollingAgg(origVar="value", newName="value7") %>% ungroup() %>%
filter(!is.na(value7)) %>%
inner_join(getStateData(keepVars=c("state", "pop")), by=c("grp"="state"))
newHosp_210727 %>%
ggplot(aes(x=date, y=1000000*value7/pop)) +
geom_col(aes(fill=stringr::str_replace(name, pattern="adult_covid_confirmed_", replacement="")),
position="stack"
) +
geom_line(data=cdc_daily_210728_vaxonly$dfPerCapita %>%
select(date, grp=state, vxcpm) %>%
mutate(vxcpct=100*vxcpm/1000000) %>%
filter(date >= "2020-08-01"),
aes(x=date, y=vxcpct),
lty=2
) +
facet_wrap(~grp) +
scale_fill_discrete("Age Bucket") +
labs(x=NULL,
y="Newly hospitalized (per million total population in state)",
title="Newly hospitalized for COVID (Aug 2020 - Jul 2021)",
subtitle="Denominator is always total state population, not population by cohort"
) +
scale_x_date(date_breaks="3 months", date_labels="%b-%y") +
scale_y_continuous(sec.axis=sec_axis(~., name="Percent fully vaccinated (dotted line)")) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
legend.position="bottom"
) +
guides(fill = guide_legend(nrow = 1))
Plotting on the secondary axis should be updated so it does not depend on the current good fortune that maximum newly hospitalized per day is 100.
Function diagnoseClusters() and related functions are updated to include vaccines data:
# Updated function for more flexibility in creating weighted aggregates
combineAggData <- function(df,
aggTo=c("cluster", "date"),
wm_aggVars=c("tcpm7", "tdpm7", "cpm7", "dpm7", "hpm7"),
aggBy=list("agg1"=list(aggFunc=specNA(specSumProd),
aggVars=c("pop"),
wtVar=NULL,
prefix=NULL
),
"agg2"=list(aggFunc=specNA(weighted.mean),
aggVars=wm_aggVars,
wtVar="pop",
prefix="wm_"
)
),
fnJoin=dplyr::full_join
) {
# FUNCTION ARGUMENTS:
# df: a data frame containing data for summarizing to an aggregate
# aggTo: the level to which data should be aggregated
# wm_aggVars: variables for which a population weighted mean should be created for the cluster total
# aggBy: a list of lists directing the creation of aggregates
# fnJoin: a function for joining the relevant aggTo aggregates
# Create an empty aggregation list
aggList <- list()
# Create the aggregates based on the instruction list
# Potentially update later so that the instructions can be skipped when the defaults should be used
for (ctr in seq_along(aggBy)) {
aggList[[ctr]] <- df %>%
createGroupAgg(aggTo=aggTo,
aggFunc=aggBy[[ctr]]$aggFunc,
aggVars=aggBy[[ctr]]$aggVars,
wtVar=aggBy[[ctr]]$wtVar,
prefix=aggBy[[ctr]]$prefix
)
}
# Return the joined data
joinFrames(aggList, fnJoin=fnJoin, keyJoin=aggTo)
}
# Updated for more options in plot 4, including vaccinated (total and 65+)
createSummary <- function(df,
p4AggVars=c("wm_tcpm7", "wm_tdpm7", "wm_hpm7"),
stateClusterDF=NULL,
brewPalette=NA
) {
# FUNCTION ARGUMENTS:
# df: an integrated data frame by cluster-date
# p4AggVars: variables to be used for aggregates in plot 4
# stateClusterDF: a data frame containing state-cluster (NULL means it can be found in df)
# brewPalette: character string for a palette from RColorBrewer to be used (NA means default colors)
# Create plots that can be relevant for a dashboard, including:
# 1. Map of segments
# 2. Bar plot of counts by segment
# 3. Facetted bar plot of segment descriptors (e.g., population, burden per million)
# 4. Facetted trend-line plot of burden by segments
# Create a map of the clusters
p1 <- helperSummaryMap(if(is.null(stateClusterDF)) df else stateClusterDF,
discreteValues=TRUE,
labelScale=is.na(brewPalette),
textLabel=c("RI", "CT", "DE", "MD", "DC"),
extraArgs=if(is.na(brewPalette)) list() else
list("arg1"=scale_fill_brewer("Cluster", palette=brewPalette))
)
# Create a bar plot of counts by segment
p2 <- helperSummaryMap(if(is.null(stateClusterDF)) df else stateClusterDF,
discreteValues=TRUE,
labelScale=is.na(brewPalette),
countOnly=TRUE,
extraArgs=if(is.na(brewPalette)) list() else
list("arg1"=scale_fill_brewer("Cluster", palette=brewPalette))
)
# Create plot for population and burden by cluster
p3 <- df %>%
helperAggTotal(aggVars=c("pop", "wm_tcpm7", "wm_tdpm7"),
mapper=c("pop"="Population (millions)",
"wm_tcpm7"="Cases per thousand",
"wm_tdpm7"="Deaths per million"
),
xLab=NULL,
yLab=NULL,
title=NULL,
divideBy=c("pop"=1000000, "wm_tcpm7"=1000),
extraArgs=if(is.na(brewPalette)) list() else
list("arg1"=scale_fill_brewer("Cluster", palette=brewPalette))
)
# Create plot for cumulative burden per million over time
p4xtra <- list(arg1=scale_x_date(date_breaks="2 months", date_labels="%b-%y"),
arg2=theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
)
if(!is.na(brewPalette)) p4xtra$arg3 <- scale_color_brewer("Cluster", palette=brewPalette)
p4 <- df %>%
helperAggTrend(aggVars=p4AggVars,
mapper=c("wm_tcpm7"="Cases per thousand\n(cumulative)",
"wm_tdpm7"="Deaths per million\n(cumulative)",
"wm_hpm7"="Hospitalized per million\n(current)",
"wm_vxcpm7"="Fully vaccinated\n(% of total pop)",
"wm_vxcgte65pct"="Fully vaccinated\n(% of total pop 65+)"
),
yLab=NULL,
title=NULL,
divideBy=c("wm_tcpm7"=1000, "wm_vxcpm7"=1000000, "wm_vxcgte65pct"=100),
linesize=0.75,
extraArgs=p4xtra
)
list(p1=p1, p2=p2, p3=p3, p4=p4)
}
# Function to create diagnoses and plots for clustering data
diagnoseClusters <- function(lst,
lstExtract=fullListExtract,
clusterFrame=clustersToFrame(lst),
wm_aggVars=c("tcpm7", "tdpm7", "cpm7", "dpm7", "hpm7"),
brewPalette=NA,
printSummary=TRUE,
printDetailed=TRUE
) {
# FUNCTION ARGUMENTS:
# lst: a list containing processed clustering data
# lstExtract: the elements to extract from lst with an optional function for converting the elements
# NULL means use the extracted element as-is
# clusterFrame: the clusters to be plotted (default is to match to useClusters)
# wm_aggVars: variables where a population-weighted mean should be produced for the cluster-aggregate
# brewPalette: the color palette to use with scale_*_brewer()
# default NA means use the standard color/fill profile
# printSummary: boolean, should summary plots be printed to the log?
# printDetailed: boolean, should detailed plots be printed to the log?
# Create the integrated and aggregate data from lst
dfFull <- integrateData(lst, lstExtract=lstExtract, otherDF=list(clusterFrame))
dfAgg <- combineAggData(dfFull, wm_aggVars=wm_aggVars)
# Create the main summary plots
summaryPlots <- createSummary(dfAgg, stateClusterDF=clusterFrame, brewPalette=brewPalette)
# Create the detailed summaries
detPlots <- createDetailedSummaries(dfDetail=dfFull, dfAgg=dfAgg, brewPalette=brewPalette)
# Print the summary plots if requested
if (isTRUE(printSummary)) {
gridExtra::grid.arrange(summaryPlots$p1 + theme(legend.position="none"),
summaryPlots$p3 + theme(legend.position="left"),
summaryPlots$p4,
layout_matrix=rbind(c(1, 2),
c(3, 3)
)
)
}
# Print the detailed plots if requested
if (isTRUE(printDetailed)) purrr::walk(detPlots, .f=print)
# Return a list of the key plotting files
list(dfFull=dfFull,
dfAgg=dfAgg,
plotClusters=clusterFrame,
summaryPlots=summaryPlots,
detPlots=detPlots
)
}
The updated functions are run to confirm that they still work with legacy (no vaccines) data:
# Create the cluster assessments
plotDataList_210727 <- diagnoseClusters(lst=list("stateData"=readFromRDS("cdc_daily_210727")[["stateData"]],
"dfPerCapita"=readFromRDS("cdc_daily_210727")[["dfPerCapita"]],
"useClusters"=readFromRDS("cdc_daily_210727")[["useClusters"]]
),
lstExtract=list("stateData"=
function(x) colSelector(x, vecSelect=c("state", "pop")),
"dfPerCapita"=NULL
),
brewPalette="Paired"
)
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj =
## prefer_proj): Discarded datum unknown in CRS definition
The defaults continue to work for legacy (no vaccines) data.
Vaccines data are integrated with burden data for a full per-capita file, and diagnoseClusters() is further updated for a complete plot by cluster:
# Function to create diagnoses and plots for clustering data
diagnoseClusters <- function(lst,
lstExtract=fullListExtract,
clusterFrame=clustersToFrame(lst),
wm_aggVars=c("tcpm7", "tdpm7", "cpm7", "dpm7", "hpm7"),
summaryAggVars=c("wm_tcpm7", "wm_tdpm7", "wm_hpm7"),
detailAggVars=c("tcpm7", "tdpm7", "cpm7", "dpm7", "hpm7"),
brewPalette=NA,
printSummary=TRUE,
printDetailed=TRUE
) {
# FUNCTION ARGUMENTS:
# lst: a list containing processed clustering data
# lstExtract: the elements to extract from lst with an optional function for converting the elements
# NULL means use the extracted element as-is
# clusterFrame: the clusters to be plotted (default is to match to useClusters)
# wm_aggVars: variables where a population-weighted mean should be produced for the cluster-aggregate
# summaryAggVars: variables to be included in plot 4 for the overall summary
# detailAggVars: variables to be included in plot 2 for the detailed summary
# brewPalette: the color palette to use with scale_*_brewer()
# default NA means use the standard color/fill profile
# printSummary: boolean, should summary plots be printed to the log?
# printDetailed: boolean, should detailed plots be printed to the log?
# Create the integrated and aggregate data from lst
dfFull <- integrateData(lst, lstExtract=lstExtract, otherDF=list(clusterFrame))
dfAgg <- combineAggData(dfFull, wm_aggVars=wm_aggVars)
# Create the main summary plots
summaryPlots <- createSummary(dfAgg,
stateClusterDF=clusterFrame,
brewPalette=brewPalette,
p4AggVars=summaryAggVars
)
# Create the detailed summaries
detPlots <- createDetailedSummaries(dfDetail=dfFull,
dfAgg=dfAgg,
brewPalette=brewPalette,
p2DetMetrics=detailAggVars,
mapper=c("tcpm"="Cases per million\n(cumulative)",
"tdpm"="Deaths per million\n(cumulative)",
"cpm7"="Cases\nper million",
"dpm7"="Deaths\nper million",
"hpm7"="Hospitalized\nper million",
"tdpm7"="Deaths (cum)\nper million",
"tcpm7"="Cases (cum)\nper million",
"vxcpm7"="Fully vaccinated\nper million",
"vxcgte65pct"="Fully vacinated\n65+ (%)"
)
)
# Print the summary plots if requested
if (isTRUE(printSummary)) {
gridExtra::grid.arrange(summaryPlots$p1 + theme(legend.position="none"),
summaryPlots$p3 + theme(legend.position="left"),
summaryPlots$p4,
layout_matrix=rbind(c(1, 2),
c(3, 3)
)
)
}
# Print the detailed plots if requested
if (isTRUE(printDetailed)) purrr::walk(detPlots, .f=print)
# Return a list of the key plotting files
list(dfFull=dfFull,
dfAgg=dfAgg,
plotClusters=clusterFrame,
summaryPlots=summaryPlots,
detPlots=detPlots
)
}
vaxNames <- names(vaxPerCap_210717_func_v4_new)
burdenNames <- names(readFromRDS("cdc_daily_210727")[["dfPerCapita"]])
cat("\nVariables in both burden data and vaccines data are:", intersect(vaxNames, burdenNames), "\n")
##
## Variables in both burden data and vaccines data are: state date
# Create updated dfPerCapita
dfPerCapita_210716 <- readFromRDS("cdc_daily_210727")[["dfPerCapita"]] %>%
filter(date <= "2021-07-16") %>%
left_join(vaxPerCap_210717_func_v4_new, by=c("state", "date")) %>%
checkUniqueRows(uniqueBy=c("state", "date"))
##
## *** File has been checked for uniqueness by: state date
# Create the cluster assessments
plotDataList_210716 <- diagnoseClusters(lst=list("stateData"=readFromRDS("cdc_daily_210727")[["stateData"]],
"dfPerCapita"=dfPerCapita_210716,
"useClusters"=readFromRDS("cdc_daily_210727")[["useClusters"]]
),
lstExtract=list("stateData"=
function(x) colSelector(x, vecSelect=c("state", "pop")),
"dfPerCapita"=NULL
),
wm_aggVars=c("tcpm7", "tdpm7", "cpm7", "dpm7", "hpm7",
"vxcpm7", "vxcgte65pct"
),
detailAggVars=c("tcpm7", "tdpm7", "cpm7", "dpm7", "hpm7",
"vxcpm7", "vxcgte65pct"
),
brewPalette="Paired"
)
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj =
## prefer_proj): Discarded datum unknown in CRS definition
An updated version of readRunCDCDaily() is written to allow for varying aggregate variables, and new data are downloaded, processed, and summarized:
# Function to download/load, process, segment, and analyze data for CDC daily
readRunCDCDaily <- function(thruLabel,
downloadTo=list("cdcDaily"=NA, "cdcHosp"=NA, "vax"=NA),
readFrom=downloadTo,
compareFile=list("cdcDaily"=NA, "cdcHosp"=NA, "vax"=NA),
writeLog=NULL,
ovrwriteLog=TRUE,
dfPerCapita=NULL,
useClusters=NULL,
hierarchical=TRUE,
returnList=!isTRUE(hierarchical),
kCut=6,
reAssignState=vector("list", 0),
weightedMeanAggs=eval(formals(diagnoseClusters)$wm_aggVars),
detailedPlotAggs=weightedMeanAggs,
skipAssessmentPlots=FALSE,
brewPalette=NA,
...
) {
# FUNCTION ARGUMENTS:
# thruLabel: the label for when the data are through (e.g., "Aug 30, 2020")
# donwloadTo: named list for locations to download data (cdcDaily and cdcHosp)
# NA means do not download data for that particular element
# readFrom: named list for locations to read data from (defaults to donwloadTo)
# compareFile: named list for the reference file to be used for cdcDaily and cdcHosp
# NA means do not use a reference file for that element
# dateChangePlot: boolean, should changes in dates be captured as a plot rather than as a list?
# dateMetricPrint: boolean, should the changes by date and metric be printed to the main log?
# writeLog: name of a separate log file for capturing detailed data on changes between files
# NULL means no detailed data captured
# ovrwriteLog: boolean, should the log file be overwritten and started again from scratch?
# dfPerCapita: file can be passed directly, which bypasses the loading and processing steps
# default NULL means create dfPerCapita using steps 2-4
# useClusters: file containing clusters by state (NULL means make the clusters from the data)
# hierarchical: boolean, should hierarchical clusters be produced (if FALSE, will be k-means)?
# returnList: boolean, should a list be returned or just the cluster object?
# refers to what is returned by clusterStates(); the main function always returns a list
# kCut: number of segments when cutting the hierarchical tree
# reAssignState: mapping file for assigning a state to another state's cluster
# format list("stateToChange"="stateClusterToAssign")
# weightedMeanAggs: variables where a population-weighted cluster mean should be created
# detailedPlotAggs: variables that should be included in the cluster-level disease evolution detailed plots
# skipAssessmentPlots: boolean to skip the plots for assessClusters()
# especially useful if just exploring dendrograms or silhouette widths
# brewPalette: create plots using this color scheme (needs to be valid in ggplot2::scale_*_brewer())
# NA means use R default color schemes
# ...: arguments to be passed to clusterStates(), will be used only if useClusters is NULL
# STEP 0: Function to create the return list
createFinalList <- function(plots=TRUE) {
list(stateData=stateData,
dfRaw=dfRawList,
dfProcess=dfProcessList,
dfPerCapita=dfPerCapita,
useClusters=useClusters,
plotDataList=if(plots) plotDataList else NULL
)
}
# STEP 1: Get state data
stateData <- getStateData()
# If a log file is requested, create the log file (allows for append=TRUE for all downstream functions)
if (!is.null(writeLog)) genNewLog(writeLog=writeLog, ovrwriteLog=ovrwriteLog)
# Get the data types to be used (will be the elements of readFrom) and create a file storage list
elemUsed <- names(readFrom)
dfRawList <- vector("list", length=length(elemUsed)) %>% purrr::set_names(elemUsed)
dfProcessList <- vector("list", length=length(elemUsed)) %>% purrr::set_names(elemUsed)
# Steps 2-4 are run only is dfPerCapita has not been passed
if (is.null(dfPerCapita)) {
# Step 2: Download and QC all of the requested data
for (elem in elemUsed) {
dfRawList[[elem]] <- readQCRawCDCDaily(fileName=readFrom[[elem]],
writeLog=writeLog,
ovrwriteLog=FALSE,
urlType=elem,
getData=if(is.na(downloadTo[[elem]])) FALSE else TRUE,
dfRef=compareFile[[elem]]
)
glimpseLog(dfRawList[[elem]], txt=paste0("\nRaw file for ", elem, ":\n"), logFile=writeLog)
}
# Step 3: Process all of the requested data
for (elem in elemUsed) {
dfProcessList[[elem]] <- processRawFile(dfRawList[[elem]],
vecRename=c(), # already handled in readQCRawCDCDaily()
vecSelect=vecSelectMapper[[elem]],
lstCombo=lstComboMapper[[elem]],
lstFilter=lstFilterMapper[[elem]]
)
glimpseLog(dfProcessList[[elem]], txt=paste0("\nProcessed for ", elem, ":\n"), logFile=writeLog)
}
# Step 4: Integrate in to a dfPerCapita file and glimpse (to specified log file)
dfPerCapita <- createPerCapita(dfProcessList,
uqBy=c("state", "date"),
popData=stateData,
mapper=perCapMapper,
asIsVars=if(isTRUE(exists("asIsMapper"))) asIsMapper[[elem]] else c()
)
glimpseLog(dfPerCapita, txt="\nIntegrated per capita data file:\n", logFile=writeLog)
} else {
dfRawList <- NULL
dfProcessList <- NULL
}
# STEP 5: Create the clusters (if they have not been passed)
if (is.null(useClusters)) {
clData <- clusterStates(df=dfPerCapita, hierarchical=hierarchical, returnList=returnList, ...)
useClusters <- getClusters(clData, hier=hierarchical, kCut=kCut, reAssign=reAssignState)
}
# STEP 5a: Stop the process and return what is available if skipAssessmentPlots is TRUE
if (skipAssessmentPlots)
return(createFinalList(plots=FALSE))
# STEP 6: Create the cluster assessments
lstFuns <- list("stateData"=function(x) colSelector(x, vecSelect=c("state", "pop")),
"dfPerCapita"=NULL
)
plotDataList <- diagnoseClusters(lst=list("stateData"=stateData,
"dfPerCapita"=dfPerCapita,
"useClusters"=useClusters
),
lstExtract=lstFuns,
wm_aggVars=weightedMeanAggs,
detailAggVars=detailedPlotAggs,
brewPalette=brewPalette
)
# STEP 7: Return a list of the key data
return(createFinalList(plots=TRUE))
}
New data are downloaded and processed, with summaries produced using existing state-level segments:
readList <- list("cdcDaily"="./RInputFiles/Coronavirus/CDC_dc_downloaded_210801.csv",
"cdcHosp"="./RInputFiles/Coronavirus/CDC_h_downloaded_210801.csv",
"vax"="./RInputFiles/Coronavirus/vaxData_downloaded_210801.csv"
)
compareList <- list("cdcDaily"=readFromRDS("cdc_daily_210708")$dfRaw$cdcDaily,
"cdcHosp"=readFromRDS("cdc_daily_210708")$dfRaw$cdcHosp,
"vax"=cdc_daily_210728_vaxonly$dfRaw$vax
)
cdc_daily_210801 <- readRunCDCDaily(thruLabel="Jul 31, 2021",
downloadTo=lapply(readList, FUN=function(x) if(file.exists(x)) NA else x),
readFrom=readList,
compareFile=compareList,
writeLog=NULL,
useClusters=readFromRDS("cdc_daily_210528")$useClusters,
weightedMeanAggs=c("tcpm7", "tdpm7", "cpm7", "dpm7", "hpm7",
"vxcpm7", "vxcgte65pct"
),
skipAssessmentPlots=FALSE,
brewPalette="Paired"
)
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/CDC_dc_downloaded_210801.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## submission_date = col_character(),
## state = col_character(),
## tot_cases = col_double(),
## conf_cases = col_double(),
## prob_cases = col_double(),
## new_case = col_double(),
## pnew_case = col_double(),
## tot_death = col_double(),
## conf_death = col_double(),
## prob_death = col_double(),
## new_death = col_double(),
## pnew_death = col_double(),
## created_at = col_character(),
## consent_cases = col_character(),
## consent_deaths = col_character()
## )
##
## *** File has been checked for uniqueness by: state date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 25
##
## Checking for similarity of: state
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## date name newValue refValue absDelta pctDelta
## 1 2020-02-02 tot_deaths 143 152 9 0.06101695
## 2 2020-02-03 tot_deaths 143 152 9 0.06101695
## 3 2020-02-04 tot_deaths 143 152 9 0.06101695
## 4 2020-02-05 tot_deaths 143 152 9 0.06101695
## 5 2020-02-06 tot_deaths 143 152 9 0.06101695
## 6 2020-02-07 tot_deaths 143 152 9 0.06101695
## 7 2020-02-08 tot_deaths 144 153 9 0.06060606
## 8 2020-02-09 tot_deaths 144 153 9 0.06060606
## 9 2020-02-10 tot_deaths 144 153 9 0.06060606
## 10 2020-02-11 tot_deaths 144 153 9 0.06060606
## 11 2020-02-12 tot_deaths 144 153 9 0.06060606
## 12 2020-02-13 tot_deaths 144 153 9 0.06060606
## 13 2020-02-14 tot_deaths 144 153 9 0.06060606
## 14 2020-02-15 tot_deaths 144 153 9 0.06060606
## 15 2020-02-16 tot_deaths 144 153 9 0.06060606
## 16 2020-02-17 tot_deaths 144 153 9 0.06060606
## 17 2020-02-18 tot_deaths 144 153 9 0.06060606
## 18 2020-02-19 tot_deaths 145 154 9 0.06020067
## 19 2020-02-20 tot_deaths 145 154 9 0.06020067
## 20 2020-02-21 tot_deaths 145 154 9 0.06020067
## 21 2020-02-22 tot_deaths 145 154 9 0.06020067
## 22 2020-02-23 tot_deaths 145 154 9 0.06020067
## 23 2020-02-24 tot_deaths 145 154 9 0.06020067
## 24 2020-02-25 tot_deaths 145 154 9 0.06020067
## 25 2020-02-26 tot_deaths 145 154 9 0.06020067
## 26 2020-02-27 tot_deaths 146 155 9 0.05980066
## 27 2020-02-28 tot_deaths 146 155 9 0.05980066
## 28 2020-02-29 tot_deaths 147 156 9 0.05940594
## 29 2020-03-01 tot_deaths 147 156 9 0.05940594
## 30 2020-03-02 tot_deaths 153 162 9 0.05714286
## 31 2020-03-03 tot_deaths 156 165 9 0.05607477
## 32 2020-03-04 tot_deaths 158 167 9 0.05538462
## 33 2020-03-05 tot_deaths 160 169 9 0.05471125
## 34 2020-03-06 tot_deaths 163 172 9 0.05373134
## 35 2020-03-07 tot_deaths 168 177 9 0.05217391
## 36 2020-03-08 tot_deaths 173 182 9 0.05070423
## 37 2020-02-02 tot_cases 510 612 102 0.18181818
## 38 2020-02-03 tot_cases 542 644 102 0.17200675
## 39 2020-02-04 tot_cases 550 652 102 0.16971714
## 40 2020-02-05 tot_cases 555 657 102 0.16831683
## 41 2020-02-06 tot_cases 557 658 101 0.16625514
## 42 2020-02-07 tot_cases 562 663 101 0.16489796
## 43 2020-02-08 tot_cases 570 670 100 0.16129032
## 44 2020-02-09 tot_cases 605 705 100 0.15267176
## 45 2020-02-10 tot_cases 614 713 99 0.14920874
## 46 2020-02-11 tot_cases 625 721 96 0.14264487
## 47 2020-02-12 tot_cases 635 731 96 0.14055637
## 48 2020-02-13 tot_cases 641 736 95 0.13798112
## 49 2020-02-14 tot_cases 649 743 94 0.13505747
## 50 2020-02-15 tot_cases 654 748 94 0.13409415
## 51 2020-02-16 tot_cases 667 758 91 0.12771930
## 52 2020-02-17 tot_cases 685 776 91 0.12457221
## 53 2020-02-18 tot_cases 692 783 91 0.12338983
## 54 2020-02-19 tot_cases 709 799 90 0.11936340
## 55 2020-02-20 tot_cases 723 811 88 0.11473272
## 56 2020-02-21 tot_cases 742 829 87 0.11075748
## 57 2020-02-22 tot_cases 768 855 87 0.10720887
## 58 2020-02-23 tot_cases 792 877 85 0.10185740
## 59 2020-02-24 tot_cases 811 896 85 0.09958992
## 60 2020-02-25 tot_cases 835 920 85 0.09686610
## 61 2020-02-26 tot_cases 879 963 84 0.09120521
## 62 2020-02-27 tot_cases 916 998 82 0.08568443
## 63 2020-02-28 tot_cases 968 1049 81 0.08031730
## 64 2020-02-29 tot_cases 1005 1087 82 0.07839388
## 65 2020-03-01 tot_cases 1094 1177 83 0.07309555
## 66 2020-03-02 tot_cases 1172 1254 82 0.06760099
## 67 2020-03-03 tot_cases 1343 1424 81 0.05854716
## 68 2020-03-04 tot_cases 1482 1565 83 0.05447982
## 69 2021-07-05 new_deaths 104 37 67 0.95035461
## 70 2021-07-04 new_deaths 98 38 60 0.88235294
## 71 2021-01-18 new_deaths 2674 1130 1544 0.81177708
## 72 2021-07-03 new_deaths 140 86 54 0.47787611
## 73 2021-01-19 new_deaths 3036 4578 1542 0.40504334
## 74 2020-12-26 new_deaths 2248 3093 845 0.31642015
## 75 2020-12-24 new_deaths 3274 2463 811 0.28272616
## 76 2021-06-27 new_deaths 139 105 34 0.27868852
## 77 2021-06-20 new_deaths 176 145 31 0.19314642
## 78 2021-06-26 new_deaths 172 142 30 0.19108280
## 79 2021-06-19 new_deaths 180 154 26 0.15568862
## 80 2021-06-28 new_deaths 193 170 23 0.12672176
## 81 2021-06-24 new_deaths 287 258 29 0.10642202
## 82 2021-06-17 new_deaths 334 302 32 0.10062893
## 83 2021-06-23 new_deaths 310 281 29 0.09813875
## 84 2021-06-25 new_deaths 300 273 27 0.09424084
## 85 2021-06-22 new_deaths 283 258 25 0.09242144
## 86 2021-06-18 new_deaths 210 192 18 0.08955224
## 87 2021-06-13 new_deaths 200 184 16 0.08333333
## 88 2021-05-30 new_deaths 237 220 17 0.07439825
## 89 2020-03-21 new_deaths 114 107 7 0.06334842
## 90 2021-06-11 new_deaths 326 306 20 0.06329114
## 91 2021-06-16 new_deaths 310 293 17 0.05638474
## 92 2021-06-15 new_deaths 336 319 17 0.05190840
## 93 2020-02-02 new_cases 1 557 556 1.99283154
## 94 2021-07-05 new_cases 11563 3575 7988 1.05535738
## 95 2021-07-04 new_cases 12794 4156 8638 1.01923304
## 96 2021-07-03 new_cases 14978 5887 9091 0.87141145
## 97 2021-06-10 new_cases 16732 12363 4369 0.30032652
## 98 2021-01-18 new_cases 138853 107646 31207 0.25320184
## 99 2021-01-19 new_cases 145009 176292 31283 0.19472706
## 100 2021-07-02 new_cases 16830 14183 2647 0.17070261
## 101 2021-06-20 new_cases 9228 7787 1441 0.16937996
## 102 2020-12-24 new_cases 222824 195402 27422 0.13113484
## 103 2021-06-01 new_cases 9689 8540 1149 0.12606287
## 104 2021-01-29 new_cases 156344 139722 16622 0.11228577
## 105 2020-12-26 new_cases 151874 169350 17476 0.10880881
## 106 2021-06-30 new_cases 17295 15526 1769 0.10779684
## 107 2021-06-28 new_cases 9690 8701 989 0.10755261
## 108 2021-06-09 new_cases 19404 21526 2122 0.10368923
## 109 2021-01-09 new_cases 249812 226455 23357 0.09808364
## 110 2021-01-30 new_cases 137321 150808 13487 0.09361779
## 111 2021-07-01 new_cases 18730 17149 1581 0.08812955
## 112 2021-06-08 new_cases 14356 15667 1311 0.08733304
## 113 2021-06-29 new_cases 16159 15051 1108 0.07100288
## 114 2021-06-06 new_cases 12102 11304 798 0.06818764
## 115 2020-07-14 new_cases 65684 61818 3866 0.06064219
## 116 2021-01-08 new_cases 295289 312357 17068 0.05617745
## 117 2021-05-24 new_cases 15657 14828 829 0.05438740
## 118 2021-05-31 new_cases 9193 9700 507 0.05367067
## 119 2021-05-03 new_cases 33239 31601 1638 0.05052437
## 120 2021-06-07 new_cases 10122 10644 522 0.05027449
## 121 2020-07-15 new_cases 70320 73939 3619 0.05017365
##
##
## ***Differences of at least 0 and at least 0.1%
##
## state name newValue refValue absDelta pctDelta
## 1 IN tot_deaths 3407157 3378244 28913 0.008522120
## 2 SC tot_deaths 2291589 2305862 14273 0.006209093
## 3 CA tot_deaths 14183041 14129523 53518 0.003780512
## 4 NC tot_deaths 3073917 3062861 11056 0.003603194
## 5 MS tot_deaths 1998075 1991323 6752 0.003384972
## 6 KY tot_deaths 1634463 1630052 4411 0.002702392
## 7 RI tot_deaths 749883 751479 1596 0.002126070
## 8 NM tot_deaths 1001515 999916 1599 0.001597857
## 9 AL tot_deaths 2742024 2738028 3996 0.001458380
## 10 CA tot_cases 865747767 837321729 28426038 0.033382123
## 11 SC tot_cases 129358076 129977727 619651 0.004778754
## 12 RI tot_cases 32453898 32591078 137180 0.004218004
## 13 AL tot_cases 131847795 131406619 441176 0.003351708
## 14 MI tot_cases 214132223 214386719 254496 0.001187793
## 15 MS tot_cases 77187328 77104046 83282 0.001079542
## 16 MS new_deaths 7432 7332 100 0.013546464
## 17 NM new_deaths 4382 4344 38 0.008709603
## 18 CA new_deaths 63517 62992 525 0.008299805
## 19 KY new_deaths 7285 7229 56 0.007716687
## 20 NC new_deaths 13517 13434 83 0.006159326
## 21 AL new_deaths 11430 11360 70 0.006143045
## 22 MI new_deaths 21076 20995 81 0.003850633
## 23 IN new_deaths 13914 13863 51 0.003672103
## 24 TX new_deaths 51507 51349 158 0.003072256
## 25 TN new_deaths 12611 12576 35 0.002779211
## 26 WA new_deaths 5954 5939 15 0.002522492
## 27 RI new_deaths 2736 2730 6 0.002195390
## 28 UT new_deaths 2371 2368 3 0.001266090
## 29 CA new_cases 3880232 3713944 166288 0.043793560
## 30 VI new_cases 3932 3916 16 0.004077472
## 31 MS new_cases 323003 321780 1223 0.003793524
## 32 AL new_cases 554270 552325 1945 0.003515288
## 33 LA new_cases 483605 482096 1509 0.003125191
## 34 NV new_cases 335771 334763 1008 0.003006559
## 35 FL new_cases 2344516 2337613 6903 0.002948659
## 36 WY new_cases 62592 62445 147 0.002351304
## 37 UT new_cases 416971 416110 861 0.002067026
## 38 KS new_cases 319154 318515 639 0.002004175
## 39 WA new_cases 453368 452483 885 0.001953964
## 40 AK new_cases 68595 68478 117 0.001707120
## 41 MI new_cases 1002081 1000375 1706 0.001703908
## 42 OR new_cases 209377 209035 342 0.001634752
## 43 NC new_cases 1015407 1014359 1048 0.001032631
##
##
##
## Raw file for cdcDaily:
## Rows: 33,360
## Columns: 15
## $ date <date> 2021-02-02, 2020-07-30, 2020-05-03, 2020-12-04, 2021-0~
## $ state <chr> "IL", "ME", "NH", "IN", "CA", "GU", "CT", "WI", "NV", "~
## $ tot_cases <dbl> 1130917, 3910, 2518, 367338, 3409079, 0, 267337, 98440,~
## $ conf_cases <dbl> 1130917, 3497, NA, NA, 3285871, NA, 250915, 92712, NA, ~
## $ prob_cases <dbl> 0, 413, NA, NA, 123208, NA, 16422, 5728, NA, 105447, NA~
## $ new_cases <dbl> 2304, 22, 89, 7899, 18703, 0, 0, 1502, 128, 199, 0, 394~
## $ pnew_case <dbl> 0, 2, 0, 0, 892, NA, 0, 94, 0, 47, NA, 5, 102, NA, 0, 0~
## $ tot_deaths <dbl> 21336, 123, 86, 7031, 49603, 0, 7381, 1237, 5586, 21047~
## $ conf_death <dbl> 19306, 122, NA, 6746, 49603, NA, 6049, 1228, NA, 19789,~
## $ prob_death <dbl> 2030, 1, NA, 285, 0, NA, 1332, 9, NA, 1258, NA, NA, 0, ~
## $ new_deaths <dbl> 63, 2, 2, 91, 494, 0, 0, 8, 0, 6, 0, 32, 60, 6, 2, 39, ~
## $ pnew_death <dbl> 16, 0, 0, 1, 0, NA, 0, 0, 0, 0, NA, 0, 0, NA, 0, 7, 0, ~
## $ created_at <chr> "02/03/2021 02:55:58 PM", "07/31/2020 02:35:06 PM", "05~
## $ consent_cases <chr> "Agree", "Agree", "Not agree", "Not agree", "Agree", "N~
## $ consent_deaths <chr> "Agree", "Agree", "Not agree", "Agree", "Agree", "Not a~
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/CDC_h_downloaded_210801.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## state = col_character(),
## date = col_date(format = ""),
## geocoded_state = col_logical()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: state date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 28
##
## Checking for similarity of: state
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## date name newValue refValue absDelta pctDelta
## 1 2021-07-02 hosp_ped 662 597 65 0.10325655
## 2 2021-07-03 hosp_ped 638 597 41 0.06639676
##
##
## ***Differences of at least 0 and at least 0.1%
##
## state name newValue refValue absDelta pctDelta
## 1 AL inp 523814 518483 5331 0.010229330
## 2 TN inp 558512 559654 1142 0.002042631
## 3 NM inp 137802 137991 189 0.001370593
## 4 NH hosp_ped 271 361 90 0.284810127
## 5 ME hosp_ped 452 509 57 0.118626431
## 6 KY hosp_ped 5518 5308 210 0.038795492
## 7 MA hosp_ped 5015 5201 186 0.036413469
## 8 AR hosp_ped 5977 5840 137 0.023186934
## 9 TN hosp_ped 7924 8102 178 0.022213902
## 10 DE hosp_ped 1647 1683 36 0.021621622
## 11 AL hosp_ped 7711 7555 156 0.020437574
## 12 WV hosp_ped 2226 2269 43 0.019132369
## 13 KS hosp_ped 1711 1679 32 0.018879056
## 14 NV hosp_ped 1999 2037 38 0.018830525
## 15 AZ hosp_ped 11435 11266 169 0.014889212
## 16 VA hosp_ped 6604 6513 91 0.013875124
## 17 IN hosp_ped 6913 6826 87 0.012664677
## 18 MS hosp_ped 3727 3686 41 0.011061648
## 19 MO hosp_ped 15406 15241 165 0.010767775
## 20 SC hosp_ped 2706 2679 27 0.010027855
## 21 PA hosp_ped 19857 20010 153 0.007675521
## 22 WA hosp_ped 4288 4263 25 0.005847269
## 23 NM hosp_ped 3125 3107 18 0.005776637
## 24 IA hosp_ped 2275 2287 12 0.005260851
## 25 CO hosp_ped 9355 9401 46 0.004905097
## 26 NJ hosp_ped 9108 9142 34 0.003726027
## 27 OH hosp_ped 25500 25406 94 0.003693081
## 28 IL hosp_ped 19711 19644 67 0.003404904
## 29 GA hosp_ped 21902 21973 71 0.003236467
## 30 MT hosp_ped 1022 1025 3 0.002931119
## 31 PR hosp_ped 11353 11380 27 0.002375401
## 32 CA hosp_ped 30719 30667 52 0.001694197
## 33 LA hosp_ped 3174 3179 5 0.001574059
## 34 TX hosp_ped 38680 38739 59 0.001524174
## 35 FL hosp_ped 54840 54921 81 0.001475934
## 36 HI hosp_ped 720 721 1 0.001387925
## 37 NC hosp_ped 10619 10606 13 0.001224971
## 38 AL hosp_adult 443621 439848 3773 0.008541330
## 39 TN hosp_adult 494022 494969 947 0.001915083
## 40 NM hosp_adult 112634 112842 208 0.001844986
## 41 ME hosp_adult 37173 37121 52 0.001399844
## 42 WV hosp_adult 126618 126444 174 0.001375157
## 43 KY hosp_adult 299353 299757 404 0.001348667
## 44 NH hosp_adult 39064 39014 50 0.001280771
## 45 CA hosp_adult 2422197 2425080 2883 0.001189534
##
##
##
## Raw file for cdcHosp:
## Rows: 27,682
## Columns: 99
## $ state <chr> ~
## $ date <date> ~
## $ critical_staffing_shortage_today_yes <dbl> ~
## $ critical_staffing_shortage_today_no <dbl> ~
## $ critical_staffing_shortage_today_not_reported <dbl> ~
## $ critical_staffing_shortage_anticipated_within_week_yes <dbl> ~
## $ critical_staffing_shortage_anticipated_within_week_no <dbl> ~
## $ critical_staffing_shortage_anticipated_within_week_not_reported <dbl> ~
## $ hospital_onset_covid <dbl> ~
## $ hospital_onset_covid_coverage <dbl> ~
## $ inpatient_beds <dbl> ~
## $ inpatient_beds_coverage <dbl> ~
## $ inpatient_beds_used <dbl> ~
## $ inpatient_beds_used_coverage <dbl> ~
## $ inp <dbl> ~
## $ inpatient_beds_used_covid_coverage <dbl> ~
## $ previous_day_admission_adult_covid_confirmed <dbl> ~
## $ previous_day_admission_adult_covid_confirmed_coverage <dbl> ~
## $ previous_day_admission_adult_covid_suspected <dbl> ~
## $ previous_day_admission_adult_covid_suspected_coverage <dbl> ~
## $ previous_day_admission_pediatric_covid_confirmed <dbl> ~
## $ previous_day_admission_pediatric_covid_confirmed_coverage <dbl> ~
## $ previous_day_admission_pediatric_covid_suspected <dbl> ~
## $ previous_day_admission_pediatric_covid_suspected_coverage <dbl> ~
## $ staffed_adult_icu_bed_occupancy <dbl> ~
## $ staffed_adult_icu_bed_occupancy_coverage <dbl> ~
## $ staffed_icu_adult_patients_confirmed_and_suspected_covid <dbl> ~
## $ staffed_icu_adult_patients_confirmed_and_suspected_covid_coverage <dbl> ~
## $ staffed_icu_adult_patients_confirmed_covid <dbl> ~
## $ staffed_icu_adult_patients_confirmed_covid_coverage <dbl> ~
## $ hosp_adult <dbl> ~
## $ total_adult_patients_hospitalized_confirmed_and_suspected_covid_coverage <dbl> ~
## $ total_adult_patients_hospitalized_confirmed_covid <dbl> ~
## $ total_adult_patients_hospitalized_confirmed_covid_coverage <dbl> ~
## $ hosp_ped <dbl> ~
## $ total_pediatric_patients_hospitalized_confirmed_and_suspected_covid_coverage <dbl> ~
## $ total_pediatric_patients_hospitalized_confirmed_covid <dbl> ~
## $ total_pediatric_patients_hospitalized_confirmed_covid_coverage <dbl> ~
## $ total_staffed_adult_icu_beds <dbl> ~
## $ total_staffed_adult_icu_beds_coverage <dbl> ~
## $ inpatient_beds_utilization <dbl> ~
## $ inpatient_beds_utilization_coverage <dbl> ~
## $ inpatient_beds_utilization_numerator <dbl> ~
## $ inpatient_beds_utilization_denominator <dbl> ~
## $ percent_of_inpatients_with_covid <dbl> ~
## $ percent_of_inpatients_with_covid_coverage <dbl> ~
## $ percent_of_inpatients_with_covid_numerator <dbl> ~
## $ percent_of_inpatients_with_covid_denominator <dbl> ~
## $ inpatient_bed_covid_utilization <dbl> ~
## $ inpatient_bed_covid_utilization_coverage <dbl> ~
## $ inpatient_bed_covid_utilization_numerator <dbl> ~
## $ inpatient_bed_covid_utilization_denominator <dbl> ~
## $ adult_icu_bed_covid_utilization <dbl> ~
## $ adult_icu_bed_covid_utilization_coverage <dbl> ~
## $ adult_icu_bed_covid_utilization_numerator <dbl> ~
## $ adult_icu_bed_covid_utilization_denominator <dbl> ~
## $ adult_icu_bed_utilization <dbl> ~
## $ adult_icu_bed_utilization_coverage <dbl> ~
## $ adult_icu_bed_utilization_numerator <dbl> ~
## $ adult_icu_bed_utilization_denominator <dbl> ~
## $ geocoded_state <lgl> ~
## $ `previous_day_admission_adult_covid_confirmed_18-19` <dbl> ~
## $ `previous_day_admission_adult_covid_confirmed_18-19_coverage` <dbl> ~
## $ `previous_day_admission_adult_covid_confirmed_20-29` <dbl> ~
## $ `previous_day_admission_adult_covid_confirmed_20-29_coverage` <dbl> ~
## $ `previous_day_admission_adult_covid_confirmed_30-39` <dbl> ~
## $ `previous_day_admission_adult_covid_confirmed_30-39_coverage` <dbl> ~
## $ `previous_day_admission_adult_covid_confirmed_40-49` <dbl> ~
## $ `previous_day_admission_adult_covid_confirmed_40-49_coverage` <dbl> ~
## $ `previous_day_admission_adult_covid_confirmed_50-59` <dbl> ~
## $ `previous_day_admission_adult_covid_confirmed_50-59_coverage` <dbl> ~
## $ `previous_day_admission_adult_covid_confirmed_60-69` <dbl> ~
## $ `previous_day_admission_adult_covid_confirmed_60-69_coverage` <dbl> ~
## $ `previous_day_admission_adult_covid_confirmed_70-79` <dbl> ~
## $ `previous_day_admission_adult_covid_confirmed_70-79_coverage` <dbl> ~
## $ `previous_day_admission_adult_covid_confirmed_80+` <dbl> ~
## $ `previous_day_admission_adult_covid_confirmed_80+_coverage` <dbl> ~
## $ previous_day_admission_adult_covid_confirmed_unknown <dbl> ~
## $ previous_day_admission_adult_covid_confirmed_unknown_coverage <dbl> ~
## $ `previous_day_admission_adult_covid_suspected_18-19` <dbl> ~
## $ `previous_day_admission_adult_covid_suspected_18-19_coverage` <dbl> ~
## $ `previous_day_admission_adult_covid_suspected_20-29` <dbl> ~
## $ `previous_day_admission_adult_covid_suspected_20-29_coverage` <dbl> ~
## $ `previous_day_admission_adult_covid_suspected_30-39` <dbl> ~
## $ `previous_day_admission_adult_covid_suspected_30-39_coverage` <dbl> ~
## $ `previous_day_admission_adult_covid_suspected_40-49` <dbl> ~
## $ `previous_day_admission_adult_covid_suspected_40-49_coverage` <dbl> ~
## $ `previous_day_admission_adult_covid_suspected_50-59` <dbl> ~
## $ `previous_day_admission_adult_covid_suspected_50-59_coverage` <dbl> ~
## $ `previous_day_admission_adult_covid_suspected_60-69` <dbl> ~
## $ `previous_day_admission_adult_covid_suspected_60-69_coverage` <dbl> ~
## $ `previous_day_admission_adult_covid_suspected_70-79` <dbl> ~
## $ `previous_day_admission_adult_covid_suspected_70-79_coverage` <dbl> ~
## $ `previous_day_admission_adult_covid_suspected_80+` <dbl> ~
## $ `previous_day_admission_adult_covid_suspected_80+_coverage` <dbl> ~
## $ previous_day_admission_adult_covid_suspected_unknown <dbl> ~
## $ previous_day_admission_adult_covid_suspected_unknown_coverage <dbl> ~
## $ deaths_covid <dbl> ~
## $ deaths_covid_coverage <dbl> ~
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/vaxData_downloaded_210801.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## Date = col_character(),
## Location = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: state date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 4
##
## Checking for similarity of: state
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 1 and at least 1%
##
## [1] date name newValue refValue absDelta pctDelta
## <0 rows> (or 0-length row.names)
##
##
## ***Differences of at least 0 and at least 0.1%
##
## [1] state name newValue refValue absDelta pctDelta
## <0 rows> (or 0-length row.names)
##
##
##
## Raw file for vax:
## Rows: 14,918
## Columns: 69
## $ date <date> 2021-07-31, 2021-07-31, 2021-0~
## $ MMWR_week <dbl> 30, 30, 30, 30, 30, 30, 30, 30,~
## $ state <chr> "AK", "NM", "PR", "RP", "MS", "~
## $ Distributed <dbl> 854805, 2449685, 4266370, 28650~
## $ Distributed_Janssen <dbl> 59300, 138500, 190000, 3800, 16~
## $ Distributed_Moderna <dbl> 366220, 1066860, 1853400, 20800~
## $ Distributed_Pfizer <dbl> 429285, 1244325, 2222970, 4050,~
## $ Distributed_Unk_Manuf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ Dist_Per_100K <dbl> 116849, 116828, 133587, 159993,~
## $ Distributed_Per_100k_12Plus <dbl> 140390, 137051, 149787, 187353,~
## $ Distributed_Per_100k_18Plus <dbl> 154979, 151123, 162779, 205421,~
## $ Distributed_Per_100k_65Plus <dbl> 933315, 648741, 785808, 944298,~
## $ vxa <dbl> 697440, 2487536, 3975244, 26286~
## $ Administered_12Plus <dbl> 695366, 2487278, 3973905, 26286~
## $ Administered_18Plus <dbl> 653874, 2330829, 3705211, 25597~
## $ Administered_65Plus <dbl> 143058, 664378, 1048492, 3120, ~
## $ Administered_Janssen <dbl> 28437, 87616, 114212, 2145, 637~
## $ Administered_Moderna <dbl> 289116, 1081538, 1691040, 23441~
## $ Administered_Pfizer <dbl> 379706, 1313894, 2169642, 700, ~
## $ Administered_Unk_Manuf <dbl> 181, 4488, 350, 0, 1047, 727, 0~
## $ Administered_Fed_LTC <dbl> 6640, 39710, 74284, 0, 54224, 1~
## $ Administered_Fed_LTC_Residents <dbl> 2078, 11847, 11431, 0, 26288, 8~
## $ Administered_Fed_LTC_Staff <dbl> 1378, 12139, 10950, 0, 12915, 5~
## $ Administered_Fed_LTC_Unk <dbl> 3184, 15724, 51903, 0, 15021, 3~
## $ Administered_Fed_LTC_Dose1 <dbl> 4300, 24065, 53094, 0, 31843, 1~
## $ Administered_Fed_LTC_Dose1_Residents <dbl> 1383, 6414, 7925, 0, 14433, 507~
## $ Administered_Fed_LTC_Dose1_Staff <dbl> 956, 6649, 7461, 0, 7685, 36183~
## $ Administered_Fed_LTC_Dose1_Unk <dbl> 1961, 11002, 37708, 0, 9725, 29~
## $ Admin_Per_100k <dbl> 95338, 118633, 124472, 146792, ~
## $ Admin_Per_100k_12Plus <dbl> 114205, 139154, 139519, 171894,~
## $ Admin_Per_100k_18Plus <dbl> 118550, 143790, 141368, 183531,~
## $ Admin_Per_100k_65Plus <dbl> 156197, 175945, 193118, 102835,~
## $ Recip_Administered <dbl> 692140, 2511859, 4003254, 26519~
## $ Administered_Dose1_Recip <dbl> 376882, 1374231, 2197391, 15199~
## $ Administered_Dose1_Pop_Pct <dbl> 51.5, 65.5, 68.8, 84.9, 39.8, 5~
## $ Administered_Dose1_Recip_12Plus <dbl> 375603, 1373997, 2196318, 15199~
## $ Administered_Dose1_Recip_12PlusPop_Pct <dbl> 61.7, 76.9, 77.1, 99.4, 46.9, 6~
## $ Administered_Dose1_Recip_18Plus <dbl> 352303, 1282733, 2045507, 14508~
## $ Administered_Dose1_Recip_18PlusPop_Pct <dbl> 63.9, 79.1, 78.0, 99.9, 50.0, 6~
## $ Administered_Dose1_Recip_65Plus <dbl> 75867, 359328, 564728, 1707, 38~
## $ Administered_Dose1_Recip_65PlusPop_Pct <dbl> 82.8, 95.2, 99.9, 56.3, 78.6, 9~
## $ vxc <dbl> 333092, 1198386, 1911719, 13461~
## $ vxcpoppct <dbl> 45.5, 57.2, 59.9, 75.2, 34.5, 5~
## $ Series_Complete_12Plus <dbl> 332299, 1198314, 1911408, 13461~
## $ Series_Complete_12PlusPop_Pct <dbl> 54.6, 67.0, 67.1, 88.0, 40.7, 6~
## $ vxcgte18 <dbl> 314089, 1126808, 1790751, 13461~
## $ vxcgte18pct <dbl> 56.9, 69.5, 68.3, 96.5, 44.0, 6~
## $ vxcgte65 <dbl> 71390, 324959, 506358, 1671, 35~
## $ vxcgte65pct <dbl> 77.9, 86.1, 93.3, 55.1, 72.7, 8~
## $ Series_Complete_Janssen <dbl> 26386, 86064, 113821, 2148, 627~
## $ Series_Complete_Moderna <dbl> 133838, 492367, 784508, 11283, ~
## $ Series_Complete_Pfizer <dbl> 172824, 618609, 1013352, 30, 52~
## $ Series_Complete_Unk_Manuf <dbl> 44, 1346, 38, 0, 138, 318, 0, 1~
## $ Series_Complete_Janssen_12Plus <dbl> 26384, 86052, 113778, 2148, 627~
## $ Series_Complete_Moderna_12Plus <dbl> 133834, 492349, 784456, 11283, ~
## $ Series_Complete_Pfizer_12Plus <dbl> 172037, 618567, 1013136, 30, 52~
## $ Series_Complete_Unk_Manuf_12Plus <dbl> 44, 1346, 38, 0, 138, 318, 0, 1~
## $ Series_Complete_Janssen_18Plus <dbl> 26261, 85937, 113673, 2148, 626~
## $ Series_Complete_Moderna_18Plus <dbl> 133469, 492021, 784182, 11283, ~
## $ Series_Complete_Pfizer_18Plus <dbl> 154318, 547516, 892859, 30, 495~
## $ Series_Complete_Unk_Manuf_18Plus <dbl> 41, 1334, 37, 0, 133, 309, 0, 1~
## $ Series_Complete_Janssen_65Plus <dbl> 2638, 18271, 19965, 212, 15374,~
## $ Series_Complete_Moderna_65Plus <dbl> 40125, 151102, 257623, 1450, 18~
## $ Series_Complete_Pfizer_65Plus <dbl> 28605, 154857, 228764, 9, 15393~
## $ Series_Complete_Unk_Manuf_65Plus <dbl> 22, 729, 6, 0, 60, 135, 0, 734,~
## $ Series_Complete_FedLTC <dbl> 2320, 15515, 21185, 0, 22390, 6~
## $ Series_Complete_FedLTC_Residents <dbl> 676, 5246, 3503, 0, 11688, 3467~
## $ Series_Complete_FedLTC_Staff <dbl> 425, 5319, 3488, 0, 5176, 23209~
## $ Series_Complete_FedLTC_Unknown <dbl> 1219, 4950, 14194, 0, 5526, 782~
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 6
## isType tot_cases tot_deaths new_cases new_deaths n
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 before 8.21e+9 1.64e+8 3.49e+7 604596 32804
## 2 after 8.17e+9 1.63e+8 3.47e+7 601834 28356
## 3 pctchg 4.40e-3 3.96e-3 4.58e-3 0.00457 0.136
##
##
## Processed for cdcDaily:
## Rows: 28,356
## Columns: 6
## $ date <date> 2021-02-02, 2020-07-30, 2020-05-03, 2020-12-04, 2021-01-28~
## $ state <chr> "IL", "ME", "NH", "IN", "CA", "CT", "WI", "NV", "MI", "MI",~
## $ tot_cases <dbl> 1130917, 3910, 2518, 367338, 3409079, 267337, 98440, 324132~
## $ tot_deaths <dbl> 21336, 123, 86, 7031, 49603, 7381, 1237, 5586, 21047, 0, 11~
## $ new_cases <dbl> 2304, 22, 89, 7899, 18703, 0, 1502, 128, 199, 0, 394, 3436,~
## $ new_deaths <dbl> 63, 2, 2, 91, 494, 0, 8, 0, 6, 0, 32, 60, 6, 2, 39, 66, 0, ~
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 5
## isType inp hosp_adult hosp_ped n
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 before 2.78e+7 2.19e+7 471723 27682
## 2 after 2.77e+7 2.18e+7 459822 26679
## 3 pctchg 5.58e-3 5.57e-3 0.0252 0.0362
##
##
## Processed for cdcHosp:
## Rows: 26,679
## Columns: 5
## $ date <date> 2020-07-22, 2020-07-20, 2020-07-19, 2020-07-18, 2020-07-18~
## $ state <chr> "IA", "IA", "ND", "IA", "ND", "TX", "OK", "CT", "ND", "NM",~
## $ inp <dbl> 0, 1, 46, 10, 33, 12003, 678, 215, 16, 119, 51, 19, 250, 14~
## $ hosp_adult <dbl> 0, 1, NA, 10, NA, 7999, 566, 115, NA, NA, NA, NA, NA, NA, N~
## $ hosp_ped <dbl> 0, 0, NA, 0, NA, 194, 9, 0, NA, NA, NA, NA, NA, NA, NA, NA,~
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 9
## isType vxa vxc vxcpoppct vxcgte65 vxcgte65pct vxcgte18 vxcgte18pct
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 before 8.19e+10 3.37e+10 302401. 1.10e+10 561791. 3.28e+10 373517.
## 2 after 3.89e+10 1.63e+10 255914. 5.34e+ 9 512134. 1.59e+10 320269.
## 3 pctchg 5.24e- 1 5.16e- 1 0.154 5.16e- 1 0.0884 5.16e- 1 0.143
## # ... with 1 more variable: n <dbl>
##
##
## Processed for vax:
## Rows: 11,730
## Columns: 9
## $ date <date> 2021-07-31, 2021-07-31, 2021-07-31, 2021-07-31, 2021-07-3~
## $ state <chr> "AK", "NM", "MS", "WI", "NY", "OK", "MD", "NH", "WV", "AL"~
## $ vxa <dbl> 697440, 2487536, 2150026, 6163565, 22950250, 3460262, 7213~
## $ vxc <dbl> 333092, 1198386, 1026837, 3015017, 11109858, 1593194, 3559~
## $ vxcpoppct <dbl> 45.5, 57.2, 34.5, 51.8, 57.1, 40.3, 58.9, 58.3, 39.0, 34.3~
## $ vxcgte65 <dbl> 71390, 324959, 353642, 895738, 2663975, 482309, 844122, 22~
## $ vxcgte65pct <dbl> 77.9, 86.1, 72.7, 88.1, 80.8, 75.9, 88.0, 87.4, 70.2, 69.5~
## $ vxcgte18 <dbl> 314089, 1126808, 1001545, 2875753, 10577357, 1533309, 3342~
## $ vxcgte18pct <dbl> 56.9, 69.5, 44.0, 63.1, 68.6, 51.0, 71.0, 68.2, 46.9, 43.1~
##
## Integrated per capita data file:
## Rows: 28,569
## Columns: 34
## $ date <date> 2020-01-01, 2020-01-01, 2020-01-01, 2020-01-01, 2020-01-0~
## $ state <chr> "AL", "HI", "IN", "LA", "MN", "MT", "NC", "TX", "AL", "HI"~
## $ tot_cases <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ tot_deaths <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ new_cases <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ new_deaths <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ inp <dbl> NA, 0, 0, NA, 0, 0, 0, 0, NA, 0, 0, NA, 0, 0, 0, 1877, 0, ~
## $ hosp_adult <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ hosp_ped <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ vxa <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ vxc <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ vxcpoppct <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ vxcgte65 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ vxcgte65pct <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ vxcgte18 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ vxcgte18pct <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ tcpm <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ tdpm <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ cpm <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ dpm <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ hpm <dbl> NA, 0.0000, 0.0000, NA, 0.0000, 0.0000, 0.0000, 0.0000, NA~
## $ ahpm <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ phpm <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ vxapm <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ vxcpm <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ tcpm7 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ tdpm7 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ cpm7 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ dpm7 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ hpm7 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ ahpm7 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ phpm7 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ vxapm7 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ vxcpm7 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj =
## prefer_proj): Discarded datum unknown in CRS definition
cdc_daily_210801$plotDataList$dfAgg %>%
select(cluster, date, wm_vxcpoppct=wm_vxcpm7, wm_vxcgte65pct) %>%
mutate(wm_vxcpoppct=wm_vxcpoppct/10000) %>%
pivot_longer(-c(cluster, date)) %>%
filter(!is.na(value)) %>%
ggplot(aes(x=date, y=value)) +
geom_line(aes(group=cluster, color=cluster), size=1) +
facet_wrap(~c("wm_vxcgte65pct"="Population 65+", "wm_vxcpoppct"="Total population")[name]) +
scale_color_discrete("Cluster") +
lims(y=c(0, 100)) +
labs(x=NULL, y="Percent fully vaccinated", title="Fully vaccinated by cluster")
cdc_daily_210801$plotDataList$dfFull %>%
filter(date==max(date)) %>%
select(state, cluster, vxcpoppct, vxcgte65pct, vxcgte18pct) %>%
pivot_longer(-c(state, cluster)) %>%
ggplot(aes(x=fct_reorder(state, value, min),
y=value
)
) +
geom_col(aes(fill=cluster)) +
facet_wrap(~~c("vxcgte65pct"="3. Age 65+", "vxcgte18pct"="2. Age 18+", "vxcpoppct"="1. Everyone")[name]) +
coord_flip() +
labs(x=NULL,
y="Fully vaccinated (%)",
title="Fully vaccinated (%) by state and age cohort as of Jul 31, 2021",
subtitle="Dotted lines at 35% and 70%"
) +
lims(y=c(0, 100)) +
geom_hline(yintercept=c(35, 70), lty=2) +
scale_fill_brewer("Cluster", palette="Paired")
saveToRDS(cdc_daily_210801, ovrWrite=FALSE, ovrWriteError=FALSE)
The updated process works as intended. Next steps are to update the .R files with the new functions.